diff --git a/Dockerfile b/Dockerfile index dcd0ea2..c1b3ff9 100644 --- a/Dockerfile +++ b/Dockerfile @@ -14,6 +14,8 @@ RUN uv sync --frozen --no-dev FROM python:3.12-slim-bookworm +RUN apt-get update && apt-get install -y --no-install-recommends curl && rm -rf /var/lib/apt/lists/* + WORKDIR /app # Copy the environment from the builder diff --git a/compose.yaml b/compose.yaml index 146a06e..900d7f9 100644 --- a/compose.yaml +++ b/compose.yaml @@ -4,9 +4,22 @@ include: services: schwab-mcp: <<: *mcp-service - <<: *mcp-healthcheck image: gitea.ext.ben.io/b3nw/schwab-mcp-custom:latest container_name: schwab-mcp + healthcheck: + test: ["CMD", "curl", "-f", "http://localhost:8000/health"] + interval: 30s + timeout: 10s + retries: 3 + start_period: 15s + deploy: + resources: + limits: + memory: 512M + cpus: '1.0' + reservations: + memory: 128M + cpus: '0.1' environment: - SCHWAB_PLAYWRIGHT_URL=ws://schwab-browser:3000/playwright/chromium - PORT=8000 diff --git a/schwab_scraper/__init__.py b/schwab_scraper/__init__.py deleted file mode 100644 index 79ecf85..0000000 --- a/schwab_scraper/__init__.py +++ /dev/null @@ -1,37 +0,0 @@ -"""Public package exports sync wrappers and unified API references.""" - -from .api import ( - get_morningstar_data, - get_transaction_history, - get_transaction_history_enhanced, - list_accounts, - get_account_overview, - get_positions, - get_portfolio_snapshot, - refresh_session, - check_session_health, - get_session_status, - get_session_info, - ensure_valid_session, - export_cookies, - set_cookies, - list_available_accounts, -) - -__all__ = [ - "get_morningstar_data", - "get_transaction_history", - "get_transaction_history_enhanced", - "list_accounts", - "get_account_overview", - "get_positions", - "get_portfolio_snapshot", - "refresh_session", - "check_session_health", - "get_session_status", - "get_session_info", - "ensure_valid_session", - "export_cookies", - "set_cookies", - "list_available_accounts", -] diff --git a/schwab_scraper/__main__.py b/schwab_scraper/__main__.py deleted file mode 100644 index aebb5fc..0000000 --- a/schwab_scraper/__main__.py +++ /dev/null @@ -1,7 +0,0 @@ -#!/usr/bin/env python3 -"""Main entry point for the schwab-morningstar-scraper package when run with python3 -m.""" - -from .cli import main - -if __name__ == "__main__": - main() \ No newline at end of file diff --git a/schwab_scraper/api.py b/schwab_scraper/api.py deleted file mode 100644 index 0c6e815..0000000 --- a/schwab_scraper/api.py +++ /dev/null @@ -1,102 +0,0 @@ -import asyncio - -from . import unified_api -from .browser.session import get_session_info as _session_info - - -def get_morningstar_data(ticker: str, debug: bool = False): - """Synchronous wrapper for `unified_api.get_morningstar_data`""" - return asyncio.run(unified_api.get_morningstar_data(ticker, debug=debug)) - - -def get_transaction_history(account=None, start_date=None, end_date=None, time_period=None, debug=False): - """Synchronous wrapper for `unified_api.get_transaction_history`""" - return asyncio.run( - unified_api.get_transaction_history( - account=account, - start_date=start_date, - end_date=end_date, - time_period=time_period, - debug=debug, - ) - ) - - -def get_transaction_history_enhanced(account=None, start_date=None, end_date=None, time_period=None, debug=False): - """Synchronous wrapper for enhanced transaction history.""" - return asyncio.run( - unified_api.get_transaction_history_enhanced( - account=account, - start_date=start_date, - end_date=end_date, - time_period=time_period, - debug=debug, - ) - ) - - -def list_accounts(debug: bool = False): - """Synchronous wrapper for account discovery.""" - return asyncio.run(unified_api.list_accounts(debug=debug)) - - -def get_account_overview(account=None, debug: bool = False): - return asyncio.run(unified_api.get_account_overview(account=account, debug=debug)) - - -def get_positions(account=None, include_non_equity: bool = False, debug: bool = False): - return asyncio.run( - unified_api.get_positions( - account=account, - include_non_equity=include_non_equity, - debug=debug, - ) - ) - - -def get_portfolio_snapshot(account=None, aggregate_by_symbol: bool = True, include_non_equity: bool = False, debug: bool = False): - return asyncio.run( - unified_api.get_portfolio_snapshot( - account=account, - aggregate_by_symbol=aggregate_by_symbol, - include_non_equity=include_non_equity, - debug=debug, - ) - ) - - -def refresh_session(debug: bool = False): - return asyncio.run(unified_api.refresh_session(debug=debug)) - - -def check_session_health(debug: bool = False): - envelope = asyncio.run(unified_api.get_session_status(debug=debug)) - return envelope["success"] - - -def get_session_status(debug: bool = False): - return asyncio.run(unified_api.get_session_status(debug=debug)) - - -def get_session_info(debug: bool = False): - return _session_info() - - -def ensure_valid_session(debug: bool = False): - envelope = asyncio.run(unified_api.refresh_session(debug=debug)) - return envelope["success"] - - -def export_cookies(cookies_path: str, debug: bool = False): - """Synchronous wrapper for exporting cookies.""" - return asyncio.run(unified_api.export_cookies(cookies_path, debug=debug)) - - -def set_cookies(cookies_path: str, debug: bool = False): - """Synchronous wrapper for setting cookies.""" - return asyncio.run(unified_api.set_cookies(cookies_path, debug=debug)) - - -def list_available_accounts(debug: bool = False): - """Synchronous wrapper for listing available transaction accounts.""" - return asyncio.run(unified_api.list_available_accounts(debug=debug)) \ No newline at end of file diff --git a/schwab_scraper/browser/__init__.py b/schwab_scraper/browser/__init__.py deleted file mode 100644 index b80d354..0000000 --- a/schwab_scraper/browser/__init__.py +++ /dev/null @@ -1,20 +0,0 @@ -from .client import connect, new_context, new_page -from .navigation import goto_with_auth_check -from .session import ( - export_cookies, - get_session_status, - refresh_session, - set_cookies_from_file, -) - -__all__ = [ - "connect", - "new_context", - "new_page", - "goto_with_auth_check", - "get_session_status", - "refresh_session", - "set_cookies_from_file", - "export_cookies", -] - diff --git a/schwab_scraper/browser/auth.py b/schwab_scraper/browser/auth.py deleted file mode 100644 index 70808d6..0000000 --- a/schwab_scraper/browser/auth.py +++ /dev/null @@ -1,1431 +0,0 @@ -import json -import os -import time -import logging -from typing import List, Dict, Any, Optional, Tuple -from playwright.async_api import async_playwright -from ..core.config import load_config, get_playwright_url, get_cookies_path -from ..utils.logging import save_debug_artifact - - -async def is_session_valid() -> bool: - """Check if current cookies.json contains a valid session - - This function validates that we have a truly valid session by checking: - 1. Multiple critical session cookies are present - 2. Those cookies haven't expired - 3. Session was established relatively recently (within 24 hours) - - Note: Cookie expiry times alone are not sufficient - Schwab may invalidate - sessions server-side. This function performs client-side validation only. - """ - logger = logging.getLogger(__name__) - cookies_path = get_cookies_path() - try: - with open(cookies_path, 'r') as f: - cookies = json.load(f) - if not cookies: - logger.debug("Session validation: No cookies found") - return False - - current_time = int(time.time()) - logger.debug(f"Session validation: Checking {len(cookies)} cookies") - - # CRITICAL session cookies - at least 3 of these should be present and valid - critical_session_cookies = { - 'auth': 'Primary authentication token', - 'ASP.NET_SessionId': 'Session ID', - 'NS2': 'Schwab session state', - 'LVAL': 'Login token', - '__RequestVerificationToken': 'CSRF token' - } - - # NON-CRITICAL cookies that may expire - non_critical_cookies = { - 'SessionInfo', - 'SS2', - 'O2', - 'sstate', - 'pstate' - } - - valid_critical_cookies = {} - validation_details = [] - - for cookie in cookies: - cookie_name = cookie.get('name', '') - expiry = cookie.get('expires', -1) - - if cookie_name in critical_session_cookies: - is_expired = expiry != -1 and expiry <= current_time - is_valid = expiry == -1 or (expiry and expiry > current_time) - - validation_details.append({ - 'name': cookie_name, - 'valid': is_valid, - 'expires': expiry, - 'expired': is_expired, - 'current_time': current_time - }) - - if is_valid: - valid_critical_cookies[cookie_name] = True - logger.debug(f"✓ Critical session cookie '{cookie_name}' is valid") - else: - logger.debug(f"✗ Critical session cookie '{cookie_name}' is expired (expires={expiry}, now={current_time})") - - # Require at least 3 critical cookies to be valid - min_required = 3 - has_valid_session = len(valid_critical_cookies) >= min_required - - if not has_valid_session: - logger.warning(f"Session validation FAILED: Only {len(valid_critical_cookies)} critical cookies valid (need ≥{min_required})") - for detail in validation_details: - logger.debug(f" {detail['name']}: {detail['valid']} (expires={detail['expires']})") - else: - logger.debug(f"✓ Session validation SUCCESS: {len(valid_critical_cookies)} critical cookies valid") - logger.debug(f" Valid cookies: {list(valid_critical_cookies.keys())}") - - return has_valid_session - - except (FileNotFoundError, json.JSONDecodeError) as e: - logger.debug(f"Session validation error: {e}") - return False - - -async def login_to_schwab(username: str, password: str) -> Optional[List[Dict[str, Any]]]: - """ - Perform automated login to Schwab using the remote browser (browserless). - On success, saves cookies to `cookies.json` and returns the cookies list. - Uses robust iframe detection and dynamic field detection. - - IMPORTANT: This function starts with a CLEAN SLATE - any existing stale cookies - are cleared before the login attempt. This prevents authentication failures from - mixing old session state with new credentials. - """ - import time - login_start_time = time.time() - logger = logging.getLogger(__name__) - - # CRITICAL: Clear any existing cookies before attempting fresh login - # Stale cookies can cause Schwab to reject the authentication - cookies_path = get_cookies_path() - try: - if os.path.exists(cookies_path): - os.remove(cookies_path) - logger.info(f"Cleared stale cookies file before fresh login: {cookies_path}") - except Exception as e: - logger.warning(f"Could not clear cookies file before login: {e}") - - config = load_config() - playwright_url = get_playwright_url(config) - - async with async_playwright() as p: - browser = await p.chromium.connect(playwright_url) - - # Create context with realistic headers and fingerprinting - context = await browser.new_context( - user_agent=( - 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 ' - '(KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36' - ), - viewport={'width': 1920, 'height': 1200}, - device_scale_factor=1.0, - locale='en-US', - timezone_id='America/New_York', - permissions=['geolocation', 'notifications'], - geolocation={'latitude': 40.7128, 'longitude': -74.0060}, - extra_http_headers={ - 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7', - 'Accept-Language': 'en-US,en;q=0.9', - 'Accept-Encoding': 'gzip, deflate, br', - 'Cache-Control': 'max-age=0', - 'Sec-Fetch-Dest': 'document', - 'Sec-Fetch-Mode': 'navigate', - 'Sec-Fetch-Site': 'none', - 'Sec-Fetch-User': '?1', - 'Sec-Ch-Ua': '"Not_A Brand";v="8", "Chromium";v="120", "Google Chrome";v="120"', - 'Sec-Ch-Ua-Mobile': '?0', - 'Sec-Ch-Ua-Platform': '"Windows"', - 'Upgrade-Insecure-Requests': '1', - 'Dnt': '1', - }, - ) - - # Enhanced anti-detection script - await context.add_init_script( - ''' - // Core webdriver hiding - Object.defineProperty(navigator, 'webdriver', { get: () => undefined }); - delete navigator.__proto__.webdriver; - - // Enhanced plugin spoofing - Object.defineProperty(navigator, 'plugins', { - get: () => [ - { name: 'Chrome PDF Plugin', filename: 'internal-pdf-viewer', description: 'Portable Document Format', length: 1 }, - { name: 'Chrome PDF Viewer', filename: 'mhjfbmdgcfjbbpaeojofohoefgiehjai', description: '', length: 1 }, - { name: 'Native Client', filename: 'internal-nacl-plugin', description: 'Native Client', length: 1 }, - ] - }); - - // Language and locale consistency - Object.defineProperty(navigator, 'languages', { get: () => ['en-US', 'en'] }); - Object.defineProperty(navigator, 'language', { get: () => 'en-US' }); - - // Screen properties matching viewport - Object.defineProperty(screen, 'width', { get: () => 1920 }); - Object.defineProperty(screen, 'height', { get: () => 1080 }); - Object.defineProperty(screen, 'availWidth', { get: () => 1920 }); - Object.defineProperty(screen, 'availHeight', { get: () => 1040 }); - Object.defineProperty(screen, 'colorDepth', { get: () => 24 }); - Object.defineProperty(screen, 'pixelDepth', { get: () => 24 }); - - // Permission handling - const originalQuery = window.navigator.permissions.query; - window.navigator.permissions.query = (parameters) => ( - parameters.name === 'notifications' ? - Promise.resolve({ state: Notification.permission }) : - originalQuery(parameters) - ); - - // Canvas fingerprinting resistance - const getContext = HTMLCanvasElement.prototype.getContext; - HTMLCanvasElement.prototype.getContext = function(type) { - const context = getContext.call(this, type); - if (type === '2d') { - const getImageData = context.getImageData; - context.getImageData = function(x, y, width, height) { - const imageData = getImageData.call(this, x, y, width, height); - // Add slight noise to canvas fingerprinting - for (let i = 0; i < imageData.data.length; i += 4) { - if (Math.random() < 0.1) { - imageData.data[i] += Math.floor(Math.random() * 10) - 5; - imageData.data[i + 1] += Math.floor(Math.random() * 10) - 5; - imageData.data[i + 2] += Math.floor(Math.random() * 10) - 5; - } - } - return imageData; - }; - } - return context; - }; - - // Hardware concurrency and memory - Object.defineProperty(navigator, 'deviceMemory', { get: () => 8 }); - Object.defineProperty(navigator, 'hardwareConcurrency', { get: () => 8 }); - - // WebGL fingerprinting - const getParameter = WebGLRenderingContext.prototype.getParameter; - WebGLRenderingContext.prototype.getParameter = function(parameter) { - if (parameter === 37445) { return 'Intel Inc.'; } - if (parameter === 37446) { return 'Intel(R) HD Graphics 620'; } - if (parameter === 7936) { return 'WebKit'; } - if (parameter === 7937) { return 'WebKit WebGL'; } - return getParameter.call(this, parameter); - }; - - // Mouse movement tracking evasion - ['mousemove', 'mousedown', 'mouseup', 'click'].forEach(eventType => { - document.addEventListener(eventType, function(e) { - Object.defineProperty(e, 'isTrusted', { value: true, writable: false }); - }, true); - }); - - // Keyboard event evasion - ['keydown', 'keypress', 'keyup'].forEach(eventType => { - document.addEventListener(eventType, function(e) { - Object.defineProperty(e, 'isTrusted', { value: true, writable: false }); - }, true); - }); - - // Hide automation indicators - Object.defineProperty(window, 'chrome', { - get: () => ({ - runtime: {}, - loadTimes: function() {}, - csi: function() {}, - app: {} - }) - }); - - // Spoof connection type - Object.defineProperty(navigator, 'connection', { - get: () => ({ - effectiveType: '4g', - rtt: 100, - downlink: 10, - saveData: false - }) - }); - ''' - ) - - page = await context.new_page() - - # Track authentication API calls for debugging and success detection - auth_api_calls = [] - fpa_fixes = 0 - - async def log_request(request): - if 'api/v2/auth' in request.url: - endpoint = '/login' if '/login' in request.url else '/assert' - logger.debug(f"AUTH API REQUEST: {request.method} {endpoint}") - # Only log payload in verbose debug mode (when --debug is used twice) - if logger.getEffectiveLevel() <= 5 and request.post_data: # TRACE level - logger.debug(f"AUTH API PAYLOAD: {request.post_data[:200]}...") - - async def log_response(response): - if 'api/v2/auth' in response.url: - endpoint = 'login' if '/login' in response.url else 'assert' - logger.debug(f"AUTH API RESPONSE: {response.status} {endpoint}") - - # Track all auth API responses for success determination - auth_api_calls.append({ - 'url': response.url, - 'status': response.status, - 'endpoint': endpoint - }) - - try: - response_text = await response.text() - - # Check for authentication failure indicators - if response.status == 403 or 'Access Denied' in response_text: - logger.debug(f"❌ AUTHENTICATION FAILED: {response.status} - {endpoint}") - elif response.status == 200 and '/assert' in response.url: - # Check if 2FA is pending - if 'wait_for_approval' in response_text or 'mobile_approve' in response_text: - logger.debug("📱 2FA mobile approval requested") - elif 'External process is pending' in response_text: - logger.debug("📱 Waiting for 2FA approval...") - - except: - logger.debug("AUTH API RESPONSE BODY: [could not read]") - - # Intercept and modify auth API requests to fix FPA parameter - async def intercept_auth_request(route, request): - nonlocal fpa_fixes - if 'api/v2/auth' in request.url and 'FPA=false' in request.url: - # Fix the FPA parameter from false to true - modified_url = request.url.replace('FPA=false', 'FPA=true') - fpa_fixes += 1 - if fpa_fixes == 1: # Only log the first fix - logger.debug("Fixed FPA parameter for authentication requests") - await route.continue_(url=modified_url) - else: - await route.continue_() - - # Route auth API calls through our interceptor - await page.route('**/api/v2/auth/**', intercept_auth_request) - - page.on('request', log_request) - page.on('response', log_response) - - try: - logger.debug("Navigating to Schwab login page…") - await page.goto("https://client.schwab.com/Areas/Access/Login", timeout=60000) - - logger.debug("Waiting for login iframe…") - iframe_element, iframe = await robust_iframe_wait(page) - if not iframe or not iframe_element: - logger.error("Could not access login iframe") - raise Exception("Could not access login iframe") - - logger.debug("Accessed iframe content") - iframe = await resolve_login_inner_frame(iframe) - - logger.debug("Waiting for login form…") - try: - await iframe.wait_for_load_state('domcontentloaded', timeout=15000) - logger.debug("Login form DOM loaded") - try: - await iframe.wait_for_load_state('networkidle', timeout=5000) - logger.debug("Login form network idle achieved") - except Exception: - logger.debug("Network idle timeout, proceeding") - except Exception as e: - logger.debug(f"DOM load timeout: {e}") - - logger.debug("Finding login fields…") - username_field, password_field = await find_login_fields_dynamically(iframe) - logger.debug(f"Dynamic detection result - username: {username_field}, password: {password_field}") - - if not username_field or not password_field: - logger.debug("Dynamic detection failed; falling back to heuristics") - # Basic fallbacks - fallback_user = [ - 'input[autocomplete="username"]', - 'input[type="text"][id="loginIdInput"]', - 'input[type="text"][placeholder*="Login ID"]', - 'input[name*="login"]', 'input[id*="login"]', 'input[type="text"]' - ] - fallback_pwd = [ - 'input[autocomplete="current-password"]', - 'input[type="password"][id="passwordInput"]', - 'input[type="password"][placeholder*="Password"]', - 'input[name*="password"]', 'input[id*="password"]', 'input[type="password"]' - ] - - original_username = username_field - original_password = password_field - - for sel in fallback_user: - try: - if await iframe.is_visible(sel): - username_field = sel - logger.debug(f"Fallback username field found: {sel}") - break - except Exception: - pass - - for sel in fallback_pwd: - try: - if await iframe.is_visible(sel): - password_field = sel - logger.debug(f"Fallback password field found: {sel}") - break - except Exception: - pass - - logger.debug(f"After fallback - username: {username_field}, password: {password_field}") - - if not username_field or not password_field: - # Dump all input fields for debugging - try: - all_inputs = await iframe.query_selector_all('input') - logger.debug(f"Found {len(all_inputs)} total input fields:") - for i, inp in enumerate(all_inputs): - try: - input_type = await inp.get_attribute('type') or 'text' - input_id = await inp.get_attribute('id') or '' - input_name = await inp.get_attribute('name') or '' - input_placeholder = await inp.get_attribute('placeholder') or '' - input_autocomplete = await inp.get_attribute('autocomplete') or '' - is_visible = await inp.is_visible() - logger.debug(f" Input {i}: type='{input_type}', id='{input_id}', name='{input_name}', placeholder='{input_placeholder}', autocomplete='{input_autocomplete}', visible={is_visible}") - except Exception: - pass - except Exception as e: - logger.debug(f"Could not enumerate input fields: {e}") - - raise Exception("Login fields not found") - - logger.debug("Filling credentials…") - - # Debug: Check what fields we're actually targeting - try: - username_element = await iframe.query_selector(username_field) - password_element = await iframe.query_selector(password_field) - - if username_element: - username_attrs = await username_element.evaluate('el => ({ id: el.id, name: el.name, type: el.type, placeholder: el.placeholder })') - logger.debug(f"Username field attributes: {username_attrs}") - else: - logger.debug(f"Username field not found with selector: {username_field}") - - if password_element: - password_attrs = await password_element.evaluate('el => ({ id: el.id, name: el.name, type: el.type, placeholder: el.placeholder })') - logger.debug(f"Password field attributes: {password_attrs}") - else: - logger.debug(f"Password field not found with selector: {password_field}") - except Exception as e: - logger.debug(f"Error checking field attributes: {e}") - - # Fill credentials using the original working approach (reverted from git history) - logger.debug("Filling credentials…") - try: - await iframe.fill(username_field, '') - await iframe.fill(username_field, username) - logger.debug("Username filled using fill()") - except Exception as e: - logger.debug(f"Username fill failed: {e}, trying click+type fallback") - try: - await iframe.click(username_field, timeout=5000) - await iframe.type(username_field, username, delay=25) - logger.debug("Username filled using click+type fallback") - except Exception as e2: - logger.debug(f"Username click+type also failed: {e2}") - - try: - await iframe.fill(password_field, '') - await iframe.fill(password_field, password) - logger.debug("Password filled using fill()") - except Exception as e: - logger.debug(f"Password fill failed: {e}, trying click+type fallback") - try: - await iframe.click(password_field, timeout=5000) - await iframe.type(password_field, password, delay=25) - logger.debug("Password filled using click+type fallback") - except Exception as e2: - logger.debug(f"Password click+type also failed: {e2}") - - # Verify filled values (original approach) - try: - user_val = await iframe.input_value(username_field) - _ = len(await iframe.input_value(password_field)) - logger.debug(f"Credentials filled (username len={len(user_val)})") - except Exception: - logger.debug("Could not verify input values; proceeding") - - # Find submit button - submit_selectors = [ - 'button[type="submit"]', 'input[type="submit"]', - 'button:has-text("Log In")', 'button:has-text("Sign In")', 'button:has-text("Continue")', - '[role="button"]:has-text("Log In")', '[role="button"]' - ] - submit_button = None - for sel in submit_selectors: - try: - if await iframe.is_visible(sel): - submit_button = sel; break - except Exception: - pass - if not submit_button: - raise Exception("Submit button not found") - - # Wait for page JavaScript to fully initialize before submission - # The HAR shows that successful logins require the frontend JS to be ready - # and the FPA parameter to be set to true (not false) - logger.debug("Waiting for authentication JavaScript to initialize...") - try: - await iframe.wait_for_function( - '''() => { - // Check if authentication-related JavaScript objects are available - return window.fetch !== undefined && - document.readyState === 'complete' && - (window.crypto !== undefined || window.msCrypto !== undefined); - }''', - timeout=10000 - ) - logger.debug("Authentication JavaScript appears ready") - except Exception as e: - logger.debug(f"JavaScript readiness check failed: {e}, proceeding anyway") - - # Additional wait to ensure all JavaScript is loaded, including FPA initialization - await page.wait_for_timeout(3000) - - # Try to trigger FPA=true by ensuring all fraud prevention scripts are loaded - try: - await iframe.evaluate(''' - () => { - // Try to trigger any deferred authentication scripts - if (window.dispatchEvent) { - window.dispatchEvent(new Event('load')); - window.dispatchEvent(new Event('DOMContentLoaded')); - } - - // Allow time for fraud prevention analytics to initialize - return true; - } - ''') - logger.debug("Triggered fraud prevention analytics initialization") - - # Wait longer for FPA to be set to true - await page.wait_for_timeout(2000) - - except Exception as e: - logger.debug(f"FPA initialization failed: {e}, proceeding anyway") - - logger.debug("Submitting login form…") - - # Ensure form submission triggers proper JavaScript events - # The HAR shows that successful login triggers /api/v2/auth/login with device fingerprinting - async with page.expect_response( - lambda response: ( - 'sws-gateway-nr.schwab.com/api/v2/auth' in response.url or - 'client.schwab.com/Areas/Access/SignOn/Auth' in response.url - ), timeout=60000 - ) as response_info: - try: - # First try: Trigger form submission via JavaScript to ensure events fire - logger.debug("Attempting JavaScript form submission to trigger auth API calls...") - await iframe.evaluate(''' - () => { - const form = document.querySelector('form'); - if (form) { - // Dispatch input events to ensure form validation - const inputs = form.querySelectorAll('input'); - inputs.forEach(input => { - input.dispatchEvent(new Event('input', { bubbles: true })); - input.dispatchEvent(new Event('change', { bubbles: true })); - }); - - // Trigger form submission - form.dispatchEvent(new Event('submit', { bubbles: true, cancelable: true })); - return true; - } - return false; - } - ''') - - # Wait a moment for JavaScript processing - await page.wait_for_timeout(1000) - - # Then click the submit button to ensure UI state changes - await iframe.click(submit_button) - logger.debug("Submit button clicked after JavaScript events") - - except Exception as e: - logger.debug(f"JavaScript submission failed: {e}, trying fallback methods") - try: - await iframe.press(password_field, 'Enter') - logger.debug("Enter key pressed") - except Exception: - await iframe.click(submit_button, force=True) - logger.debug("Force click attempted") - - # Wait for all authentication API calls to complete - await page.wait_for_timeout(5000) - - try: - response = await response_info.value - logger.debug(f"Primary authentication response: {response.status} - {response.url}") - except Exception as e: - logger.debug(f"Response monitoring error: {e}") - - # Analyze authentication API calls to determine success/failure - logger.debug(f"Analyzing {len(auth_api_calls)} authentication API calls...") - - login_success = False - assert_success = False - auth_failed = False - - for call in auth_api_calls: - if call['endpoint'] == 'login' and call['status'] == 200: - login_success = True - elif call['endpoint'] == 'assert' and call['status'] == 200: - assert_success = True - elif call['status'] == 403: - auth_failed = True - - # Determine overall authentication status - if auth_failed: - logger.debug("❌ Authentication failed: 403 Access Denied") - elif login_success and assert_success: - logger.debug("✅ Authentication successful - proceeding to 2FA flow") - await page.wait_for_timeout(3000) - elif login_success and not assert_success: - logger.debug("⚠️ Partial success - waiting for password validation") - await page.wait_for_timeout(3000) - else: - logger.debug("❓ Authentication status unclear") - - # Quick check for login errors after submission - await page.wait_for_timeout(2000) - try: - iframe_element = await page.query_selector('#lmsIframe') - if iframe_element: - iframe_check = await iframe_element.content_frame() - if iframe_check: - # Look for error messages - error_text = await iframe_check.evaluate('''() => { - const errorElements = document.querySelectorAll('[style*="color: red"], .error, .alert-danger'); - for (let el of errorElements) { - const text = el.textContent.trim(); - if (text && (text.toLowerCase().includes('invalid') || text.toLowerCase().includes('incorrect'))) { - return text; - } - } - return null; - }''') - if error_text: - logger.error(f"Login failed with error: {error_text}") - await browser.close() - return None - except Exception as e: - logger.debug(f"Error check failed: {e}") - - # OAuth flow wait with enhanced detection - logger.debug("Waiting for OAuth authorization flow…") - try: - await page.wait_for_function( - '''() => { - const mainUrl = window.location.href; - console.log('OAuth wait check - Current URL:', mainUrl); - - // Check for direct success patterns first - const successPatterns = [ - '/summary', '/app/', '/Apps/', '/accounts/', '/Areas/Accounts', - '/clientapps/accounts', '/positions', '/portfolio' - ]; - if (successPatterns.some(pattern => mainUrl.includes(pattern))) { - console.log('Direct success redirect detected:', mainUrl); - return true; - } - - // Check iframe src for auth flow - const iframe = document.querySelector('#lmsIframe'); - if (!iframe) { - console.log('No iframe found, checking for redirect...'); - return false; - } - - const iframeSrc = iframe.getAttribute('src'); - console.log('Iframe src:', iframeSrc); - - if (iframeSrc && (iframeSrc.includes('SignOn/Auth') || iframeSrc.includes('code=') || iframeSrc.includes('redirecturi='))) { - console.log('OAuth iframe detected:', iframeSrc); - return true; - } - - return false; - }''', timeout=30000 - ) - logger.debug("OAuth authorization flow detected successfully") - - # Now wait for OAuth completion - check iframe content and try to interact - logger.debug("Waiting for OAuth flow completion...") - - # Give iframe time to load OAuth content - await page.wait_for_timeout(3000) - - # Try to interact with OAuth consent screen in iframe if present - try: - iframe_element = await page.query_selector('#lmsIframe') - if iframe_element: - iframe = await iframe_element.content_frame() - if iframe: - # Wait for iframe to load - await iframe.wait_for_load_state('domcontentloaded', timeout=10000) - - # Debug: check what's in the iframe - try: - iframe_url = iframe.url - iframe_title = await iframe.title() - logger.debug(f"OAuth iframe loaded - URL: {iframe_url}, Title: {iframe_title}") - - # Check if this iframe is showing a login form that needs credentials - login_form_check = await iframe.evaluate('''() => { - const usernameFields = document.querySelectorAll('input[type="text"], input[id*="login"], input[name*="login"], input[placeholder*="login"]'); - const passwordFields = document.querySelectorAll('input[type="password"]'); - const errorElements = document.querySelectorAll('.error, [class*="error"], [class*="invalid"]'); - - return { - hasUsernameField: usernameFields.length > 0, - hasPasswordField: passwordFields.length > 0, - errorCount: errorElements.length, - errorMessages: Array.from(errorElements).map(el => el.textContent.trim()), - pageText: document.body.textContent.trim().substring(0, 200) - }; - }''') - logger.debug(f"OAuth iframe form analysis: {login_form_check}") - - # If this is a separate login form, try to fill it - if login_form_check['hasUsernameField'] and login_form_check['hasPasswordField']: - logger.debug("OAuth iframe has separate login form - attempting to fill credentials") - - # Try to find and fill fields in OAuth iframe - try: - oauth_username_selectors = [ - 'input[type="text"]', 'input[id*="login"]', 'input[name*="login"]', - 'input[placeholder*="login"]', 'input[autocomplete="username"]' - ] - oauth_password_selectors = [ - 'input[type="password"]', 'input[id*="password"]', 'input[name*="password"]' - ] - - # Fill username in OAuth iframe - for sel in oauth_username_selectors: - try: - if await iframe.is_visible(sel): - await iframe.fill(sel, username) - logger.debug(f"Filled OAuth username field: {sel}") - break - except Exception: - pass - - # Fill password in OAuth iframe - for sel in oauth_password_selectors: - try: - if await iframe.is_visible(sel): - await iframe.fill(sel, password) - logger.debug(f"Filled OAuth password field: {sel}") - break - except Exception: - pass - - await page.wait_for_timeout(1000) - - # Now submit the OAuth iframe form - oauth_submit_selectors = [ - 'button[type="submit"]', 'input[type="submit"]', - 'button:has-text("Log in")', 'button:has-text("Log In")', - 'button:has-text("Sign in")', 'button:has-text("Sign In")', - 'button:has-text("Continue")', 'button' - ] - - for submit_sel in oauth_submit_selectors: - try: - if await iframe.is_visible(submit_sel): - button_text = await iframe.text_content(submit_sel) - logger.debug(f"Submitting OAuth iframe form with button: {submit_sel} (text: {button_text})") - await iframe.click(submit_sel) - await page.wait_for_timeout(2000) - break - except Exception: - pass - - # Check if the error disappeared after submitting and look for next steps - try: - await page.wait_for_timeout(3000) # Wait for form processing - post_submit_check = await iframe.evaluate('''() => { - const errorElements = document.querySelectorAll('.error, [class*="error"], [class*="invalid"]'); - const errorText = Array.from(errorElements).map(el => el.textContent.trim()).join(' '); - - // Look for "Having trouble" buttons - const buttons = Array.from(document.querySelectorAll('button, a, [role="button"]')); - const buttonTexts = buttons.map(btn => ({ - text: btn.textContent.trim(), - tag: btn.tagName.toLowerCase(), - visible: btn.offsetParent !== null - })).filter(btn => btn.visible); - - return { - hasErrors: errorElements.length > 0, - errorText: errorText, - currentUrl: window.location.href, - availableButtons: buttonTexts - }; - }''') - logger.debug(f"OAuth iframe post-submit status: {post_submit_check}") - - # If we see "Having trouble" text, try to click the "No, I'll try" button - if 'Having trouble' in post_submit_check.get('errorText', '') or any('trouble' in btn['text'].lower() for btn in post_submit_check.get('availableButtons', [])): - logger.debug("Found 'Having trouble' page, looking for bypass button...") - - trouble_selectors = [ - "button:has-text(\"No, I'll try\")", - 'button:has-text("No, I\'ll try")', - 'button:has-text("try")', - "a:has-text(\"No, I'll try\")", - 'a:has-text("No, I\'ll try")', - '[role="button"]:has-text("try")' - ] - - for trouble_sel in trouble_selectors: - try: - if await iframe.is_visible(trouble_sel): - button_text = await iframe.text_content(trouble_sel) - logger.debug(f"Clicking trouble bypass button: {trouble_sel} (text: {button_text})") - await iframe.click(trouble_sel) - await page.wait_for_timeout(3000) - break - except Exception: - pass - - except Exception: - pass - - except Exception as oauth_fill_error: - logger.debug(f"Error filling OAuth iframe credentials: {oauth_fill_error}") - - # Get all visible elements for debugging - visible_elements = await iframe.evaluate('''() => { - const elements = []; - document.querySelectorAll('*').forEach(el => { - if (el.offsetParent !== null && el.textContent.trim()) { - const rect = el.getBoundingClientRect(); - if (rect.width > 0 && rect.height > 0) { - elements.push({ - tag: el.tagName.toLowerCase(), - text: el.textContent.trim().substring(0, 100), - type: el.type || '', - id: el.id || '', - className: el.className || '' - }); - } - } - }); - return elements.slice(0, 10); // Limit to first 10 visible elements - }''') - logger.debug(f"Visible elements in OAuth iframe: {visible_elements}") - - except Exception as debug_error: - logger.debug(f"Error debugging iframe content: {debug_error}") - - # Check for OAuth consent buttons and specific Schwab flow buttons - consent_selectors = [ - 'button:has-text("No, I\'ll try")', # Schwab account assistance bypass - 'button:has-text("Continue")', 'button:has-text("Allow")', - 'button:has-text("Accept")', 'button:has-text("Approve")', - 'input[type="submit"]', 'button[type="submit"]', - 'button', 'input[type="button"]' # Add generic button selectors - ] - - for sel in consent_selectors: - try: - if await iframe.is_visible(sel): - button_text = await iframe.text_content(sel) - logger.debug(f"Found clickable element: {sel} with text: {button_text}") - await iframe.click(sel) - logger.debug(f"Clicked OAuth element: {sel}") - await page.wait_for_timeout(2000) - break - except Exception: - pass - except Exception as e: - logger.debug(f"Error interacting with OAuth iframe: {e}") - - # Now wait for completion - try: - await page.wait_for_function( - '''() => { - const mainUrl = window.location.href; - console.log('OAuth completion check - Current URL:', mainUrl); - - // Check if main page redirected to success - const successPatterns = [ - '/summary', '/app/', '/Apps/', '/accounts/', '/Areas/Accounts', - '/clientapps/accounts', '/positions', '/portfolio' - ]; - if (successPatterns.some(pattern => mainUrl.includes(pattern))) { - console.log('Main page redirected to success:', mainUrl); - return true; - } - - // Check if iframe has navigated to 2FA/authenticators - const iframe = document.querySelector('#lmsIframe'); - if (iframe) { - const iframeSrc = iframe.getAttribute('src'); - console.log('OAuth completion iframe src:', iframeSrc); - if (iframeSrc && iframeSrc.includes('authenticators')) { - console.log('2FA/authenticators detected'); - return true; - } - } - - return false; - }''', timeout=30000 - ) - logger.debug("OAuth flow completion detected") - except Exception as completion_error: - logger.debug(f"OAuth completion timeout: {completion_error}") - - # Capture debug artifacts on OAuth timeout - try: - png = await page.screenshot(full_page=True) - save_debug_artifact("debug_oauth_timeout.png", png) - html = await page.content() - save_debug_artifact("debug_oauth_timeout.html", html) - - # Try to get iframe content as well - iframe_element = await page.query_selector('#lmsIframe') - if iframe_element: - iframe = await iframe_element.content_frame() - if iframe: - iframe_html = await iframe.content() - save_debug_artifact("debug_oauth_iframe.html", iframe_html) - iframe_png = await iframe.screenshot() - save_debug_artifact("debug_oauth_iframe.png", iframe_png) - - logger.debug("OAuth timeout debug artifacts saved") - except Exception: - pass - - except Exception as e: - logger.debug(f"OAuth flow monitoring error: {e}") - - # Check current URL and iframe after OAuth flow - current_url = page.url - logger.debug(f"Final URL check after OAuth: {current_url}") - - # Check iframe content for 2FA or completion status - try: - iframe_element = await page.query_selector('#lmsIframe') - if iframe_element: - iframe_src = await iframe_element.get_attribute('src') - logger.debug(f"Final iframe src: {iframe_src}") - if iframe_src and 'authenticators' in iframe_src: - logger.debug("2FA/authenticators page detected - updating current_url for 2FA handling") - current_url = iframe_src # Set current_url to iframe src for 2FA detection - except Exception: - pass - - if 'authenticators' not in current_url: - current_url = page.url - logger.debug(f"Current URL after OAuth flow: {current_url}") - - # Fast success - if any(p in current_url for p in ['/clientapps/accounts', '/accounts/', '/app/', '/Apps/', '/Areas/Accounts', '/summary']): - cookies = await context.cookies() - # Convert Cookie objects to dictionaries for JSON serialization - cookie_dicts = [ - { - 'name': cookie.get('name', ''), - 'value': cookie.get('value', ''), - 'domain': cookie.get('domain', ''), - 'path': cookie.get('path', ''), - 'expires': cookie.get('expires', -1), - 'httpOnly': cookie.get('httpOnly', False), - 'secure': cookie.get('secure', False), - 'sameSite': cookie.get('sameSite', 'Lax') - } - for cookie in cookies - ] - cookies_path = get_cookies_path() - with open(cookies_path, 'w') as f: - json.dump(cookie_dicts, f, indent=2) - - # Log authentication summary - login_duration = time.time() - login_start_time - logger.debug("OAuth success; cookies saved") - logger.debug(f"Login completed in {login_duration:.1f}s, {len(auth_api_calls)} API calls, {fpa_fixes} FPA fixes") - - await browser.close() - return cookie_dicts - - # Authenticators page (2FA) - if 'authenticators' in current_url or 'otp/code' in current_url: - print("\n" + "="*70) - print("📱 MFA APPROVAL REQUIRED") - print("="*70) - print("Attempting to intercept n8n webhook for SMS text code...") - - logger.info("Checking for SMS/Text message option...") - try: - target = page - iframe_element = await page.query_selector('#lmsIframe') - if iframe_element: - target = await iframe_element.content_frame() or page - - sms_button = await target.query_selector('button:has-text("Text message"), button:has-text("SMS"), :text-matches("Text message", "i"), :text-matches("SMS", "i")') - if sms_button: - logger.info("Clicking the SMS/Text message option to send code...") - await sms_button.click() - await page.wait_for_timeout(2000) - continue_btn = await target.query_selector('button:has-text("Continue"), button:has-text("Next")') - if continue_btn: - await continue_btn.click() - await page.wait_for_timeout(2000) - except Exception as e: - logger.debug(f"Could not automatically click SMS option (maybe already sent code): {e}") - - logger.info("Polling n8n webhook for MFA code (up to 2 minutes)…") - import aiohttp - import asyncio - mfa_code = None - - try: - logger.info("Waiting 5 seconds for email code to arrive before checking webhook...") - await asyncio.sleep(5) - async with aiohttp.ClientSession() as session: - for attempt in range(2): - print(f"Checking webhook for code (attempt {attempt + 1}/2)...") - try: - async with session.get("https://n8n.ext.ben.io/webhook/schwab-token") as resp: - if resp.status == 200: - data = await resp.json() - if data: - # Parse based on expected n8n output formats - code = None - if isinstance(data, dict): - code = data.get("code") or data.get("token") or data.get("login_code") or data.get("body", {}).get("code") - elif isinstance(data, list) and len(data) > 0: - code = data[-1].get("code") or data[-1].get("token") or data[-1].get("login_code") - if code: - mfa_code = code - logger.info(f"Got MFA code from webhook: {mfa_code}") - break - else: - logger.warning("Webhook returned data but no code found inside.") - else: - logger.warning(f"Webhook returned status code {resp.status}") - except Exception as e: - logger.debug(f"Webhook poll error: {e}") - - if not mfa_code and attempt == 0: - logger.info("Token not found, waiting 10 seconds before 1 retry...") - await asyncio.sleep(10) - except Exception as loop_e: - logger.error(f"Error during webhook checking: {loop_e}") - - if mfa_code: - logger.info("Entering MFA code into form...") - try: - # When on the sws-gateway-nr OTP page, the form is rendered - # directly on the page — there is no #lmsIframe wrapper here. - # Only look for the iframe when on the client.schwab.com login page. - current_page_url = page.url - if 'sws-gateway-nr' in current_page_url or 'otp' in current_page_url: - logger.debug(f"OTP page detected ({current_page_url}), querying form directly on page") - target = page - else: - target = page - iframe_element = await page.query_selector('#lmsIframe') - if iframe_element: - target = await iframe_element.content_frame() or page - - # Commonly used ids and attributes for OTP inputs on Schwab - code_input = await target.query_selector('input[type="text"], input[type="tel"], input[name*="code" i], input[id*="code" i], input[autocomplete*="one-time-code" i]') - if code_input: - await code_input.fill(str(mfa_code)) - logger.info(f"Filled OTP field with code: {mfa_code}") - - # Sometimes the submit button specifically says 'Trust device' or similar - submit_btn = await target.query_selector('button[type="submit"], button:has-text("Continue"), button:has-text("Verify"), button:has-text("Submit"), button:has-text("Log in"), button[id*="submit"], button[id*="continue"]') - if submit_btn: - await submit_btn.click() - print("Submitted MFA code successfully.") - await page.wait_for_timeout(5000) - else: - logger.warning("Submit button not found after filling OTP — waiting anyway") - await page.wait_for_timeout(5000) - else: - logger.error("OTP input field not found on page") - except Exception as e: - logger.error(f"Failed to enter MFA code: {e}") - - try: - await page.wait_for_function( - '''() => { - const url = window.location.href; - console.log('2FA wait check - Current URL:', url); - - // More comprehensive URL patterns for Schwab success pages - const successPatterns = [ - 'SignOn/Auth', # OAuth auth code stage - '/app/', # Main app - '/Apps/', # Alternative app path - '/accounts/', # Accounts page - '/Areas/Accounts', # Alternative accounts path - '/summary', # Account summary - '/clientapps/accounts', # Client apps accounts - '/positions', # Positions page - '/portfolio' # Portfolio page - ]; - - const success = successPatterns.some(pattern => url.includes(pattern)); - if (success) { - console.log('2FA wait completed successfully - URL changed to:', url); - } - return success; - }''', timeout=60000 - ) - logger.debug("2FA flow completed/detected successfully") - except Exception as e: - logger.error(f"2FA timeout or error: {e}") - current_url_after_timeout = page.url - logger.debug(f"URL after 2FA timeout: {current_url_after_timeout}") - - # Check if we're actually on a success page despite the timeout - success_patterns = ['/app/', '/Apps/', '/accounts/', '/Areas/Accounts', '/summary', '/clientapps/accounts', '/positions', '/portfolio'] - if any(pattern in current_url_after_timeout for pattern in success_patterns): - logger.info("2FA timeout, but URL indicates success - continuing") - else: - # Capture debug artifacts on 2FA failure - try: - png = await page.screenshot(full_page=True) - save_debug_artifact("debug_2fa_timeout.png", png) - html = await page.content() - save_debug_artifact("debug_2fa_timeout.html", html) - logger.debug("2FA timeout debug artifacts saved") - except Exception: - pass - - # Try one more time with a shorter timeout to see if page redirected - logger.info("Attempting 2FA recovery check...") - try: - await page.wait_for_function( - '''() => { - const url = window.location.href; - const successPatterns = ['/app/', '/Apps/', '/accounts/', '/Areas/Accounts', '/summary', '/clientapps/accounts', '/positions', '/portfolio']; - return successPatterns.some(pattern => url.includes(pattern)); - }''', timeout=10000 - ) - logger.info("2FA recovery successful") - except Exception: - logger.error("2FA recovery failed - login unsuccessful") - # Clean up bad cookies on MFA failure to prevent bad state - cookies_path = get_cookies_path() - try: - logger.warning("Removing invalid cookies after MFA failure to prevent bad state") - if os.path.exists(cookies_path): - os.remove(cookies_path) - logger.debug(f"Removed invalid cookies at {cookies_path}") - except Exception as cleanup_error: - logger.error(f"Failed to clean up cookies: {cleanup_error}") - raise - - # Authorization code stage - elif 'SignOn/Auth' in current_url: - try: - await page.wait_for_function( - '''() => { - const url = window.location.href; - return url.includes('/app/') || url.includes('/Apps/') || url.includes('/accounts/') || url.includes('/Areas/Accounts'); - }''', timeout=60000 - ) - except Exception: - logger.debug("OAuth token exchange timeout; attempting to continue") - # Try clicking continue/accept if present - try: - await page.wait_for_selector('button, input[type="submit"], a[href*="app"]', timeout=10000) - for sel in ['button:has-text("Continue")', 'button:has-text("Accept")', 'button:has-text("Allow")', 'input[type="submit"]', 'a[href*="/app/"]']: - try: - if await page.is_visible(sel): - await page.click(sel) - break - except Exception: - pass - except Exception: - pass - - # Finalize - try: - await page.wait_for_load_state('domcontentloaded', timeout=5000) - except Exception: - pass - - final_url = page.url - logger.debug(f"Final URL after OAuth flow: {final_url}") - if any(p in final_url for p in ['/app/', '/Apps/', '/accounts/', '/Areas/Accounts']): - cookies = await context.cookies() - # Convert Cookie objects to dictionaries for JSON serialization - cookie_dicts = [ - { - 'name': cookie.get('name', ''), - 'value': cookie.get('value', ''), - 'domain': cookie.get('domain', ''), - 'path': cookie.get('path', ''), - 'expires': cookie.get('expires', -1), - 'httpOnly': cookie.get('httpOnly', False), - 'secure': cookie.get('secure', False), - 'sameSite': cookie.get('sameSite', 'Lax') - } - for cookie in cookies - ] - cookies_path = get_cookies_path() - with open(cookies_path, 'w') as f: - json.dump(cookie_dicts, f, indent=2) - logger.debug("OAuth success; cookies saved") - await browser.close() - return cookie_dicts - - except Exception as e: - logger.error(f"Login error: {e}") - # Failure path: capture artifacts - try: - png = await page.screenshot(full_page=True) - save_debug_artifact("debug_oauth_failed.png", png) - html = await page.content() - save_debug_artifact("debug_oauth_failed.html", html) - except Exception: - pass - - # Clean up bad cookies on login failure to prevent bad state - cookies_path = get_cookies_path() - try: - logger.warning("Removing invalid cookies after login failure to prevent bad state") - if os.path.exists(cookies_path): - os.remove(cookies_path) - logger.debug(f"Removed invalid cookies at {cookies_path}") - except Exception as cleanup_error: - logger.error(f"Failed to clean up cookies: {cleanup_error}") - - await browser.close() - return None - - -async def ensure_cookies() -> Optional[List[Dict[str, Any]]]: - """Shared helper to ensure we have valid cookies. - - Attempts to use existing `cookies.json` if it appears valid; otherwise performs - automated login using credentials from `config.json` when available. - - IMPORTANT: Stale cookies can cause authentication failures even if they haven't - technically expired. This function implements: - 1. Client-side validation (expiry time checks) - 2. Fallback to fresh login if validation fails - 3. Automatic cleanup of stale cookies on login attempt - """ - logger = logging.getLogger(__name__) - cookies_path = get_cookies_path() - - # Try existing cookies if they appear to contain a valid session - try: - if await is_session_valid(): - logger.debug("Existing cookies appear valid, attempting to load...") - try: - with open(cookies_path, 'r') as f: - cookies = json.load(f) - if cookies: - logger.info(f"Using {len(cookies)} cached cookies from disk") - return cookies - except (FileNotFoundError, json.JSONDecodeError): - logger.debug("Could not load valid cookies from disk") - except Exception as e: - logger.debug(f"Cookie validation failed: {e}") - - # If we reach here, existing cookies are not valid - logger.info("Existing cookies not valid or not found. Attempting fresh login...") - - # Attempt automated login using config credentials - try: - from ..core.config import load_config, get_schwab_credentials - config = load_config() - username, password = get_schwab_credentials(config) - if username and password: - # IMPORTANT: Clear stale cookies before attempting new login - # This prevents authentication failures from mixing old session state with new credentials - try: - if os.path.exists(cookies_path): - logger.debug(f"Clearing stale cookies before fresh login attempt: {cookies_path}") - os.remove(cookies_path) - except Exception as cleanup_error: - logger.warning(f"Failed to clear stale cookies: {cleanup_error}") - - logger.info("Starting fresh login process...") - cookies = await login_to_schwab(username, password) - if cookies: - logger.info(f"Fresh login successful, obtained {len(cookies)} cookies") - return cookies - else: - logger.error("Fresh login failed to produce cookies") - except Exception as e: - logger.error(f"Login attempt failed: {e}") - - logger.error("Unable to establish valid session") - return None - - -# ----- Helpers migrated from legacy scraper ----- -async def find_login_fields_dynamically(iframe) -> Tuple[Optional[str], Optional[str]]: - """Try multiple strategies to find username/password fields inside iframe.""" - logger = logging.getLogger(__name__) - try: - # Strategy 1: Form-based - forms = await iframe.query_selector_all('form') - for form in forms: - text_inputs = await form.query_selector_all('input[type="text"], input[type="email"], input:not([type])') - pwd_inputs = await form.query_selector_all('input[type="password"]') - if text_inputs and pwd_inputs: - async def sel(inp): - ac = (await inp.get_attribute('autocomplete')) or '' - iid = (await inp.get_attribute('id')) or '' - nm = (await inp.get_attribute('name')) or '' - if ac: return f'input[autocomplete="{ac}"]' - if iid: return f'#{iid}' - if nm: return f'input[name="{nm}"]' - return 'input[type="text"], input[type="email"], input:not([type])' - return await sel(text_inputs[0]), await sel(pwd_inputs[0]) - - # Strategy 2: Proximity/attributes - password_fields = await iframe.query_selector_all('input[type="password"]') - for pwd in password_fields: - pwd_id = (await pwd.get_attribute('id')) or '' - pwd_name = (await pwd.get_attribute('name')) or '' - ac = (await pwd.get_attribute('autocomplete')) or '' - pwd_sel = 'input[autocomplete="current-password"]' if ac == 'current-password' else (f'#{pwd_id}' if pwd_id else (f'input[name="{pwd_name}"]' if pwd_name else 'input[type="password"]')) - for cand in [ - 'input[autocomplete="username"]', 'input[type="email"]', 'input[name*="login" i]', - 'input[id*="login" i]', 'input[name*="user" i]', 'input[id*="user" i]', - 'input[aria-label*="Login" i]', 'input[placeholder*="Login" i]', 'input[placeholder*="User" i]', 'input[type="text"]' - ]: - try: - if await iframe.is_visible(cand): - return cand, pwd_sel - except Exception: - pass - - # Strategy 3: Scoring - all_inputs = await iframe.query_selector_all('input') - username_candidates: List[Tuple[str, int]] = [] - password_candidates: List[str] = [] - for el in all_inputs: - input_type = (await el.get_attribute('type')) or '' - name = (await el.get_attribute('name')) or '' - iid = (await el.get_attribute('id')) or '' - placeholder = (await el.get_attribute('placeholder')) or '' - aria = (await el.get_attribute('aria-label')) or '' - ac = (await el.get_attribute('autocomplete')) or '' - if input_type.lower() in ['text', 'email', ''] and input_type.lower() != 'password': - score = 0 - text = f"{name} {iid} {placeholder} {aria}".lower() - for kw in ['login', 'user', 'email', 'username', 'id', 'account']: - if kw in text: score += 1 - if ac.lower() == 'username': score += 3 - is_vis = await iframe.is_visible(f'input[name="{name}"]' if name else (f'#{iid}' if iid else 'input')) - if is_vis: score += 2 - if score > 0: - selector = f'input[autocomplete="{ac}"]' if ac else (f'input[name="{name}"]' if name else (f'#{iid}' if iid else None)) - if selector: username_candidates.append((selector, score)) - if input_type.lower() == 'password': - is_vis = await iframe.is_visible(f'input[name="{name}"]' if name else (f'#{iid}' if iid else 'input[type="password"]')) - if is_vis: - selector = f'input[autocomplete="{ac}"]' if ac else (f'input[name="{name}"]' if name else (f'#{iid}' if iid else 'input[type="password"]')) - password_candidates.append(selector) - if username_candidates and password_candidates: - return max(username_candidates, key=lambda x: x[1])[0], password_candidates[0] - return None, None - except Exception as e: - logger.debug(f"Dynamic detection error: {e}") - return None, None - - -async def resolve_login_inner_frame(iframe_root): - """Some deployments nest the actual login form inside another iframe.""" - try: - try: - if await iframe_root.query_selector('input[type="password"]'): - return iframe_root - except Exception: - pass - child_iframes = await iframe_root.query_selector_all('iframe') - for child in child_iframes: - try: - sub = await child.content_frame() - if not sub: - continue - await sub.wait_for_load_state('domcontentloaded', timeout=5000) - if await sub.query_selector('input[type="password"]'): - return sub - except Exception: - continue - return iframe_root - except Exception: - return iframe_root - - -async def robust_iframe_wait(page, iframe_selector: str = '#lmsIframe', max_retries: int = 3, timeout: int = 30000): - """Robustly wait for login iframe with retries and multiple strategies.""" - logger = logging.getLogger(__name__) - for attempt in range(max_retries): - try: - try: - await page.wait_for_selector(iframe_selector, timeout=timeout // max_retries) - iframe_element = await page.wait_for_selector(iframe_selector) - iframe = await iframe_element.content_frame() - if iframe: - await iframe.wait_for_load_state('domcontentloaded', timeout=10000) - return iframe_element, iframe - except Exception: - pass - try: - iframes = await page.query_selector_all('iframe') - for iframe_elem in iframes: - iframe_id = await iframe_elem.get_attribute('id') - if 'lms' in (iframe_id or '').lower(): - iframe = await iframe_elem.content_frame() - if iframe: - await iframe.wait_for_load_state('domcontentloaded', timeout=5000) - return iframe_elem, iframe - except Exception: - pass - try: - iframe_elems = await page.query_selector_all('iframe') - for iframe_elem in iframe_elems: - src = await iframe_elem.get_attribute('src') or '' - if any(k in src.lower() for k in ['login', 'auth', 'signin']): - iframe = await iframe_elem.content_frame() - if iframe: - await iframe.wait_for_load_state('domcontentloaded', timeout=5000) - return iframe_elem, iframe - except Exception: - pass - if attempt < max_retries - 1: - await page.wait_for_timeout(2000) - except Exception: - if attempt < max_retries - 1: - await page.wait_for_timeout(2000) - logger.debug("Failed to find login iframe after all attempts") - return None, None diff --git a/schwab_scraper/browser/client.py b/schwab_scraper/browser/client.py deleted file mode 100644 index c77b9bf..0000000 --- a/schwab_scraper/browser/client.py +++ /dev/null @@ -1,30 +0,0 @@ -from typing import Any -from playwright.async_api import async_playwright - - -async def connect(playwright_url: str): - p = await async_playwright().start() - browser = await p.chromium.connect(playwright_url) - return p, browser - - -async def new_context(browser, cookies: list[dict] | None = None, user_agent: str | None = None): - context = await browser.new_context( - user_agent=user_agent or 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36' - ) - if cookies: - valid_same_site_values = ['Strict', 'Lax', 'None'] - for cookie in cookies: - if cookie.get('sameSite') not in valid_same_site_values: - if cookie.get('sameSite') == 'no_restriction': - cookie['sameSite'] = 'None' - else: - cookie['sameSite'] = 'Lax' - await context.add_cookies(cookies) # type: ignore - return context - - -async def new_page(context): - return await context.new_page() - - diff --git a/schwab_scraper/browser/navigation.py b/schwab_scraper/browser/navigation.py deleted file mode 100644 index b0af046..0000000 --- a/schwab_scraper/browser/navigation.py +++ /dev/null @@ -1,38 +0,0 @@ -async def ensure_authenticated_page(page, context, debug: bool = False) -> bool: - if 'login' in page.url.lower() or 'sessiontimeout=y' in page.url.lower(): - if debug: - print("DEBUG: Detected session timeout, attempting re-authentication...") - from ..core.config import load_config, get_schwab_credentials # adjusted after refactor - from .auth import login_to_schwab - config = load_config() - username, password = get_schwab_credentials(config) - if username and password: - fresh_cookies = await login_to_schwab(username, password) - if fresh_cookies: - await context.clear_cookies() - await context.add_cookies(fresh_cookies) - if debug: - print("DEBUG: Re-authentication successful") - return True - else: - if debug: - print("DEBUG: Re-authentication failed") - return False - else: - if debug: - print("DEBUG: No credentials available for re-authentication") - return False - return True - - -async def goto_with_auth_check(page, context, url: str, debug: bool = False, timeout: int = 60000): - await page.goto(url, timeout=timeout) - await page.wait_for_load_state('domcontentloaded') - if not await ensure_authenticated_page(page, context, debug=debug): - return False - if 'login' in page.url.lower() or 'sessiontimeout=y' in page.url.lower(): - await page.goto(url, timeout=timeout) - await page.wait_for_load_state('domcontentloaded') - return True - - diff --git a/schwab_scraper/browser/session.py b/schwab_scraper/browser/session.py deleted file mode 100644 index cac04a5..0000000 --- a/schwab_scraper/browser/session.py +++ /dev/null @@ -1,470 +0,0 @@ -""" -Session management module for maintaining Schwab authenticated sessions. -This module provides functionality to refresh session state through browser navigation -without requiring 2FA approval for active sessions. -""" - -import json -import logging -import time -from typing import List, Dict, Any, Optional -from datetime import datetime - -from playwright.async_api import async_playwright -from ..core.config import load_config, get_playwright_url, get_cookies_path -from .client import new_context, new_page -from ..core import ErrorType, Envelope, fail, ok - - -async def refresh_session_state(cookies: Optional[List[Dict[str, Any]]] = None) -> bool: - """ - Refresh session state through browser navigation. - - This function maintains active sessions by navigating to a Schwab page, - which updates cookie expiration times and session state without requiring - 2FA approval for active sessions. - - Args: - cookies: Optional list of cookies to use. If None, loads from cookies.json - - Returns: - bool: True if session refresh was successful, False otherwise - """ - logger = logging.getLogger(__name__) - - try: - logger.info("Starting session refresh through navigation") - - # Load cookies if not provided - if cookies is None: - cookies_path = get_cookies_path() - try: - with open(cookies_path, 'r') as f: - cookies = json.load(f) - logger.info(f"Loaded {len(cookies) if cookies else 0} cookies from {cookies_path}") - except (FileNotFoundError, json.JSONDecodeError) as e: - logger.error(f"Could not load cookies: {e}") - return False - - if not cookies: - logger.error("No cookies available for session refresh") - return False - - config = load_config() - playwright_url = get_playwright_url(config) - - async with async_playwright() as p: - try: - browser = await p.chromium.connect(playwright_url) - except Exception as e: - logger.error(f"Failed to connect to browser: {e}") - return False - - try: - # Create context with existing cookies - context = await new_context(browser, cookies=cookies) - page = await new_page(context) - - # Navigate to refresh session state - logger.info("Navigating to Schwab research page to refresh session") - await page.goto("https://client.schwab.com/app/research/#/stocks/AAPL", timeout=30000) - await page.wait_for_timeout(2000) # Let page settle and cookies update - - # Check if navigation was successful (no redirect to login) - current_url = page.url - is_redirected = any(pattern in current_url for pattern in [ - '/login', '/signin', '/auth', '/Access/' - ]) - - if is_redirected: - logger.warning(f"Session refresh failed: redirected to login page") - logger.debug(f"Current URL: {current_url}") - await context.close() - await browser.close() - return False - - # Get updated cookies after navigation - new_cookies = await context.cookies() - logger.info(f"Retrieved {len(new_cookies)} cookies after navigation") - - # Check if we still have critical session cookies - critical_session_cookies = ['LVAL', 'NS2', 'sstate'] - missing_critical_cookies = [] - - for cookie_name in critical_session_cookies: - old_cookie = next((c for c in cookies if c['name'] == cookie_name), None) - new_cookie = next((c for c in new_cookies if c['name'] == cookie_name), None) - - if not new_cookie: - missing_critical_cookies.append(cookie_name) - elif old_cookie and new_cookie.get('expires') != -1: - # Session cookies should have expires = -1 - missing_critical_cookies.append(f"{cookie_name} (invalid session cookie)") - - if missing_critical_cookies: - logger.warning(f"Session refresh failed: missing critical session cookies: {missing_critical_cookies}") - await context.close() - await browser.close() - return False - - # Compare cookie states to detect changes - changes = [] - old_dict = {c['name']: c for c in cookies} - new_dict = {c['name']: c for c in new_cookies} - - # Check for modified cookies (especially expiration changes) - for name in old_dict: - if name in new_dict: - old_cookie = old_dict[name] - new_cookie = new_dict[name] - - # Check if expiration changed - old_expires = old_cookie.get('expires', -1) - new_expires = new_cookie.get('expires', -1) - if old_expires != new_expires: - changes.append({ - 'type': 'expiration_changed', - 'name': name, - 'old_expires': old_expires, - 'new_expires': new_expires - }) - - if changes: - logger.info(f"Detected {len(changes)} cookie changes (session refreshed)") - for change in changes[:3]: # Show first 3 - logger.debug(f" {change['name']}: expiration updated") - else: - logger.info("No cookie changes detected (session maintained)") - - # Save updated cookies - cookies_path = get_cookies_path() - with open(cookies_path, 'w') as f: - json.dump(new_cookies, f, indent=2) - logger.info(f"Saved {len(new_cookies)} updated cookies") - - await context.close() - await browser.close() - - return True - - except Exception as e: - logger.error(f"Error during session refresh: {e}") - try: - await context.close() - except: - pass - await browser.close() - return False - - except Exception as e: - logger.error(f"Session refresh failed: {e}") - return False - - -async def maintain_session_health() -> bool: - """ - Check if the current session is healthy by attempting a simple navigation. - - Returns: - bool: True if session is healthy, False if refresh is needed - """ - logger = logging.getLogger(__name__) - - try: - logger.info("Checking session health") - - # Load current cookies - cookies_path = get_cookies_path() - try: - with open(cookies_path, 'r') as f: - cookies = json.load(f) - except (FileNotFoundError, json.JSONDecodeError): - logger.error("No valid cookies found") - return False - - if not cookies: - logger.error("No cookies available") - return False - - # First, check if we have valid session cookies (basic check) - current_time = int(time.time()) - has_valid_session_cookies = False - - for cookie in cookies: - name = cookie.get('name', '') - expires = cookie.get('expires', -1) - - # Check for actual Schwab session cookies - if name in ['auth', 'ASP.NET_SessionId', 'SessionInfo', '__RequestVerificationToken']: - # Session cookies (expires=-1) are valid until browser closes - # Other cookies must not be expired - if expires == -1 or (expires and expires > current_time): - has_valid_session_cookies = True - break - - if not has_valid_session_cookies: - logger.warning("Session health check: FAILED - no valid session cookies found") - return False - - config = load_config() - playwright_url = get_playwright_url(config) - - async with async_playwright() as p: - browser = await p.chromium.connect(playwright_url) - - try: - context = await new_context(browser, cookies=cookies) - page = await new_page(context) - - # Navigate to a simple page to test session - await page.goto("https://client.schwab.com/app/research/#/stocks/AAPL", timeout=30000) - - # Check if we're still authenticated by URL pattern - current_url = page.url - logger.debug(f"Current URL after navigation: {current_url}") - - is_authenticated_by_url = any(pattern in current_url for pattern in [ - '/app/', '/Apps/', '/accounts/', '/Areas/Accounts', '/summary' - ]) - - # Check for login redirect patterns - is_redirected = any(pattern in current_url for pattern in [ - '/login', '/signin', '/auth', '/Access/' - ]) - - logger.debug(f"Authenticated by URL pattern: {is_authenticated_by_url}") - logger.debug(f"Redirected to login: {is_redirected}") - - # Primary check: If we're not redirected and have a good URL pattern, we're authenticated - if is_authenticated_by_url and not is_redirected: - logger.info("Session health check: PASSED - authenticated URL detected") - result = True - elif is_redirected: - logger.warning("Session health check: FAILED - redirect to login detected") - result = False - else: - # Secondary check: Look for any page content that indicates we're not on a login page - try: - # Check for login form elements - login_indicators = [ - 'input[type="password"]', - 'input[name*="login"]', - 'input[name*="user"]', - 'input[id*="login"]', - 'input[id*="user"]', - 'button:has-text("Log In")', - 'button:has-text("Sign In")' - ] - - login_found = False - for selector in login_indicators: - login_element = await page.query_selector(selector) - if login_element: - login_found = True - break - - if login_found: - logger.warning("Session health check: FAILED - login form detected") - result = False - else: - logger.info("Session health check: PASSED - no login form detected") - result = True - - except Exception as e: - logger.debug(f"Login form check error: {e}") - # If we can't check, assume healthy if we have valid cookies and no redirect - logger.info("Session health check: PASSED - based on cookies and URL") - result = True - - await context.close() - await browser.close() - - return result - - except Exception as e: - logger.error(f"Session health check error: {e}") - try: - await context.close() - except: - pass - await browser.close() - return False - - except Exception as e: - logger.error(f"Session health check failed: {e}") - return False - - -def get_session_info() -> Dict[str, Any]: - """ - Get information about the current session state. - - Returns: - Dict containing session information - """ - cookies_path = get_cookies_path() - try: - with open(cookies_path, 'r') as f: - cookies = json.load(f) - - session_cookies = [] - expiring_cookies = [] - current_time = datetime.now().timestamp() - - for cookie in cookies: - name = cookie.get('name', '') - expires = cookie.get('expires', -1) - - # Check if this is a session-related cookie - if any(keyword in name.lower() for keyword in ['session', 'auth', 'token']): - session_cookies.append({ - 'name': name, - 'domain': cookie.get('domain', ''), - 'expires': expires, - 'is_session_cookie': expires == -1 - }) - - if expires != -1 and expires > 0: - days_until_expire = (expires - current_time) / (24 * 3600) - if days_until_expire < 7: # Expiring within a week - expiring_cookies.append({ - 'name': name, - 'days_until_expire': days_until_expire - }) - - return { - 'total_cookies': len(cookies), - 'session_cookies': len(session_cookies), - 'expiring_cookies': len(expiring_cookies), - 'expiring_soon': expiring_cookies, - 'session_status': 'active' if session_cookies else 'no_session_cookies' - } - - except (FileNotFoundError, json.JSONDecodeError): - return { - 'error': 'No valid cookies found', - 'total_cookies': 0, - 'session_cookies': 0, - 'expiring_cookies': 0, - 'expiring_soon': [], - 'session_status': 'missing_cookies' - } - - -async def ensure_valid_session() -> bool: - """ - Ensure we have a valid session, attempting refresh if needed. - - Returns: - bool: True if a valid session exists or was successfully refreshed - """ - logger = logging.getLogger(__name__) - - # First check if we have any cookies - cookies_path = get_cookies_path() - try: - with open(cookies_path, 'r') as f: - cookies = json.load(f) - - if not cookies: - logger.error("No cookies available") - return False - - except (FileNotFoundError, json.JSONDecodeError): - logger.error("No valid cookies found") - return False - - # Check session health - if await maintain_session_health(): - logger.info("Session is healthy") - return True - - # Session needs refresh - logger.info("Session needs refresh, attempting navigation refresh") - return await refresh_session_state(cookies) - - -async def get_session_status(debug: bool = False) -> Envelope[dict]: - logger = logging.getLogger(__name__) - - try: - # First get basic cookie information - info = get_session_info() - - # If we have session cookies, validate they actually work with Schwab - if info.get('session_status') == 'active': - logger.debug("Session cookies found, validating with Schwab...") - - # Use maintain_session_health to actually test the session - is_healthy = await maintain_session_health() - - if not is_healthy: - # Update status to reflect that cookies exist but are invalid - info['session_status'] = 'invalid' - info['validation_error'] = 'Session cookies exist but Schwab authentication failed' - logger.warning("Session validation failed: cookies present but not accepted by Schwab") - else: - logger.debug("Session validation succeeded") - - logger.debug("Session status info: %s", info) - return ok(info) - except Exception as exc: - logger.exception("Failed to gather session status") - return fail(str(exc), ErrorType.UNKNOWN, retryable=True) - - -async def refresh_session(debug: bool = False) -> Envelope[None]: - logger = logging.getLogger(__name__) - - try: - refreshed = await refresh_session_state() - if refreshed: - logger.info("Session refresh succeeded") - return ok(None) - logger.warning("Session refresh failed") - return fail("Session refresh failed", ErrorType.AUTHENTICATION, retryable=True) - except Exception as exc: - logger.exception("Exception during session refresh") - return fail(str(exc), ErrorType.UNKNOWN, retryable=True) - - -async def set_cookies_from_file(path: str, debug: bool = False) -> Envelope[None]: - logger = logging.getLogger(__name__) - - try: - with open(path, "r") as fh: - cookies = json.load(fh) - - cookies_path = get_cookies_path() - with open(cookies_path, "w") as fh: - json.dump(cookies, fh, indent=2) - - logger.info("Imported %s cookies from %s", len(cookies), path) - return ok(None) - except (FileNotFoundError, json.JSONDecodeError) as exc: - logger.error("Failed to load cookies from %s: %s", path, exc) - return fail(str(exc), ErrorType.VALIDATION, retryable=False) - except Exception as exc: - logger.exception("Unexpected error importing cookies from %s", path) - return fail(str(exc), ErrorType.UNKNOWN, retryable=True) - - -async def export_cookies(path: str, debug: bool = False) -> Envelope[None]: - logger = logging.getLogger(__name__) - - cookies_path = get_cookies_path() - try: - with open(cookies_path, "r") as fh: - cookies = json.load(fh) - - with open(path, "w") as fh: - json.dump(cookies, fh, indent=2) - - logger.info("Exported %s cookies to %s", len(cookies), path) - return ok(None) - except (FileNotFoundError, json.JSONDecodeError) as exc: - logger.error("Failed to read cookies for export: %s", exc) - return fail(str(exc), ErrorType.AUTHENTICATION, retryable=False) - except Exception as exc: - logger.exception("Unexpected error exporting cookies to %s", path) - return fail(str(exc), ErrorType.UNKNOWN, retryable=True) \ No newline at end of file diff --git a/schwab_scraper/cli.py b/schwab_scraper/cli.py deleted file mode 100644 index eced2e1..0000000 --- a/schwab_scraper/cli.py +++ /dev/null @@ -1,191 +0,0 @@ -import asyncio -import argparse -import json -import os -from dataclasses import asdict, is_dataclass -from typing import Any - -from . import unified_api -from .browser.auth import login_to_schwab -from .core.config import load_config, get_schwab_credentials, set_config_path, set_cookies_path - - -def _to_serializable(obj: Any) -> Any: - if is_dataclass(obj): - return asdict(obj) - if isinstance(obj, list): - return [_to_serializable(item) for item in obj] - if isinstance(obj, dict): - return {key: _to_serializable(value) for key, value in obj.items()} - return obj - - -def _print_envelope(envelope): - payload = dict(envelope) - payload["data"] = _to_serializable(payload.get("data")) - print(json.dumps(payload, indent=2, default=str)) - - -async def test_scraper(ticker: str, debug: bool): - """Test the get_morningstar_data function.""" - print(f"Running scraper test for ticker: {ticker}") - data = await unified_api.get_morningstar_data(ticker, debug=debug) - _print_envelope(data) - - -async def async_main(): - parser = argparse.ArgumentParser(description="Schwab Morningstar Scraper CLI") - parser.add_argument("ticker", nargs='?', help="Stock ticker to scrape") - parser.add_argument("--debug", action="store_true", help="Enable debug output") - parser.add_argument("--login", action="store_true", help="Login only (don't scrape)") - parser.add_argument("--test", action="store_true", help="Test mode") - parser.add_argument("--phase1", action="store_true", help="Extract Phase 1 enhanced equity data (quote, dividends, earnings, valuation ratios)") - - # Configuration file paths - parser.add_argument("--config-path", metavar="PATH", help="Custom path for config.json file") - parser.add_argument("--cookies-path", metavar="PATH", help="Custom path for cookies.json file") - - # Session commands - parser.add_argument("--session-status", action="store_true", help="Display current session status") - parser.add_argument("--export-cookies", metavar="PATH", help="Export cookies to file") - parser.add_argument("--set-cookies", metavar="PATH", help="Load cookies from file") - - # Transactions + accounts - parser.add_argument("--transactions", action="store_true", help="Export and parse transaction history") - parser.add_argument("--list-accounts", action="store_true", help="List available accounts") - - parser.add_argument("--account", help="Account identifier (ending digits like 604 or name like Joint)") - parser.add_argument("--start-date", help="Start date for custom range (YYYY-MM-DD)") - parser.add_argument("--end-date", help="End date for custom range (YYYY-MM-DD)") - parser.add_argument("--time-period", help="Preset period (e.g., 'Current Month', 'Last 6 Months')") - - # Accounts & positions - parser.add_argument("--account-overview", nargs='?', const="", help="Show balances for account or aggregate if omitted") - parser.add_argument("--positions", nargs='?', const="", help="Show positions for account or aggregate if omitted") - parser.add_argument("--portfolio-snapshot", nargs='?', const="", help="Show portfolio snapshot for account or aggregate if omitted") - parser.add_argument("--include-non-equity", action="store_true", help="Include non-equity positions") - parser.add_argument("--no-aggregate", action="store_true", help="Disable symbol aggregation in portfolio snapshot") - - args = parser.parse_args() - - # Apply custom path overrides if provided - if args.config_path: - if not os.path.exists(args.config_path): - print(f"Error: Config file not found: {args.config_path}") - return - set_config_path(args.config_path) - if args.cookies_path: - # Note: cookies.json may not exist yet (created on first login) - # so we don't validate existence, only that parent directory exists - cookies_dir = os.path.dirname(args.cookies_path) - if cookies_dir and not os.path.exists(cookies_dir): - print(f"Error: Directory for cookies file does not exist: {cookies_dir}") - return - set_cookies_path(args.cookies_path) - - if args.login: - # Set up debug logging when --debug is used - if args.debug: - import logging - logging.basicConfig(level=logging.DEBUG, format='%(asctime)s %(levelname)s %(name)s: %(message)s') - print("Debug logging enabled") - - config = load_config() - username, password = get_schwab_credentials(config) - if username and password: - print("Attempting to log in...") - if args.debug: - print(f"Using browserless server: {config.get('playwright', {}).get('url', 'default')}") - - cookies = await login_to_schwab(username, password) - if cookies: - print("Login successful and cookies saved.") - print(f"Saved {len(cookies)} cookies to cookies.json") - else: - print("Login failed.") - else: - print("Schwab username and password not found in config.json.") - return - - if args.session_status: - envelope = await unified_api.get_session_status(debug=args.debug) - _print_envelope(envelope) - return - - if args.set_cookies: - envelope = await unified_api.set_cookies(args.set_cookies, debug=args.debug) - _print_envelope(envelope) - return - - if args.export_cookies: - envelope = await unified_api.export_cookies(args.export_cookies, debug=args.debug) - _print_envelope(envelope) - return - - if args.list_accounts: - envelope = await unified_api.list_accounts(debug=args.debug) - _print_envelope(envelope) - return - - if args.account_overview is not None: - account_arg = args.account_overview or None - envelope = await unified_api.get_account_overview(account=account_arg, debug=args.debug) - _print_envelope(envelope) - return - - if args.positions is not None: - # If --positions has a value, use it. Otherwise, fall back to --account. - account_arg = args.positions if args.positions != "" else args.account - envelope = await unified_api.get_positions( - account=account_arg, - include_non_equity=args.include_non_equity, - debug=args.debug, - ) - _print_envelope(envelope) - return - - if args.portfolio_snapshot is not None: - account_arg = args.portfolio_snapshot or None - envelope = await unified_api.get_portfolio_snapshot( - account=account_arg, - aggregate_by_symbol=not args.no_aggregate, - include_non_equity=args.include_non_equity, - debug=args.debug, - ) - _print_envelope(envelope) - return - - if args.transactions: - envelope = await unified_api.get_transaction_history( - account=args.account, - start_date=args.start_date, - end_date=args.end_date, - time_period=args.time_period, - debug=args.debug, - ) - _print_envelope(envelope) - return - - if args.ticker: - if args.test: - await test_scraper(args.ticker, args.debug) - elif args.phase1: - print(f"Extracting Phase 1 enhanced equity data for {args.ticker}...") - envelope = await unified_api.get_equity_phase1_data(args.ticker, debug=args.debug) - _print_envelope(envelope) - else: - print(f"Scraping Morningstar data for {args.ticker}...") - envelope = await unified_api.get_morningstar_data(args.ticker, debug=args.debug) - _print_envelope(envelope) - return - - parser.print_help() - - -def main(): - """Entry point for console script""" - asyncio.run(async_main()) - - -if __name__ == "__main__": - main() \ No newline at end of file diff --git a/schwab_scraper/core/__init__.py b/schwab_scraper/core/__init__.py deleted file mode 100644 index f8fa2c2..0000000 --- a/schwab_scraper/core/__init__.py +++ /dev/null @@ -1,20 +0,0 @@ -from .contracts import ( # noqa: F401 - Envelope, - ErrorType, - AccountOverview, - AccountSummary, - Lot, - MorningstarData, - PortfolioSnapshot, - Position, - SessionStatus, - Transaction, - # Phase 1 data structures - QuoteData, - EnhancedDividends, - EarningsData, - CalculatedMetrics, - EquityPhase1Data, - fail, - ok, -) diff --git a/schwab_scraper/core/config.py b/schwab_scraper/core/config.py deleted file mode 100644 index c2f9658..0000000 --- a/schwab_scraper/core/config.py +++ /dev/null @@ -1,134 +0,0 @@ -import json -import logging -import os -from typing import Optional - -# Module-level state for runtime path overrides -_config_path_override: Optional[str] = None -_cookies_path_override: Optional[str] = None - - -def set_config_path(path: Optional[str]) -> None: - """ - Set a custom path for config.json at runtime. - This override takes precedence over environment variables and defaults. - - Note: This uses module-level state and is not thread-safe. Suitable for - single-threaded CLI usage or single async operations. - - Args: - path: Absolute or relative path to config file, or None to reset - """ - global _config_path_override - _config_path_override = path - - -def set_cookies_path(path: Optional[str]) -> None: - """ - Set a custom path for cookies.json at runtime. - This override takes precedence over environment variables and defaults. - - Note: This uses module-level state and is not thread-safe. Suitable for - single-threaded CLI usage or single async operations. - - Args: - path: Absolute or relative path to cookies file, or None to reset - """ - global _cookies_path_override - _cookies_path_override = path - - -def get_config_path() -> str: - """ - Resolve the configuration file path using priority order: - 1. Runtime override (set_config_path) - 2. Environment variable SCHWAB_CONFIG_PATH - 3. Default locations (../config.json relative to module, then ./config.json) - - Returns: - str: Path to configuration file - """ - # Priority 1: Runtime override - if _config_path_override: - return _config_path_override - - # Priority 2: Environment variable - env_path = os.environ.get('SCHWAB_CONFIG_PATH') - if env_path: - return env_path - - # Priority 3: Default locations - # Try package root first (for development/installed package) - default_path = os.path.join(os.path.dirname(__file__), '..', 'config.json') - if os.path.exists(default_path): - return default_path - - # Fall back to current working directory - return 'config.json' - - -def get_cookies_path() -> str: - """ - Resolve the cookies file path using priority order: - 1. Runtime override (set_cookies_path) - 2. Environment variable SCHWAB_COOKIES_PATH - 3. Default location (./cookies.json in CWD) - - Returns: - str: Path to cookies file - """ - # Priority 1: Runtime override - if _cookies_path_override: - return _cookies_path_override - - # Priority 2: Environment variable - env_path = os.environ.get('SCHWAB_COOKIES_PATH') - if env_path: - return env_path - - # Priority 3: Default location - return 'cookies.json' - - -def load_config(): - """Load configuration from config.json (or custom path if configured)""" - logger = logging.getLogger(__name__) - config_path = get_config_path() - - try: - with open(config_path, 'r') as f: - return json.load(f) - except FileNotFoundError: - logger.error(f"config.json not found at {config_path}. Please create one based on config.json.sample") - return None - except json.JSONDecodeError: - logger.error(f"Invalid JSON in config file at {config_path}") - return None - - -def get_playwright_url(config=None): - """Get the Playwright browserless URL from config""" - import os - env_url = os.environ.get('SCHWAB_PLAYWRIGHT_URL') - if env_url: - return env_url - - if config is None: - config = load_config() - - if config and 'playwright' in config and 'url' in config['playwright']: - return config['playwright']['url'] - else: - # Default fallback URL - return "ws://browser.local.ben.io:3000/playwright/chromium" - - -def get_schwab_credentials(config=None): - """Get Schwab credentials from config""" - if config is None: - config = load_config() - - if config and 'schwab' in config: - return config['schwab'].get('username'), config['schwab'].get('password') - else: - return None, None \ No newline at end of file diff --git a/schwab_scraper/core/contracts.py b/schwab_scraper/core/contracts.py deleted file mode 100644 index 984941a..0000000 --- a/schwab_scraper/core/contracts.py +++ /dev/null @@ -1,271 +0,0 @@ -from __future__ import annotations - -from dataclasses import dataclass, field -from datetime import datetime -from decimal import Decimal -from enum import Enum -from typing import Generic, Optional, TypeVar - -from typing_extensions import TypedDict - - -T = TypeVar("T") - - -class ErrorType(str, Enum): - """Categorisation for envelope failures.""" - - AUTHENTICATION = "AUTHENTICATION" - NETWORK = "NETWORK" - PARSING = "PARSING" - VALIDATION = "VALIDATION" - UNKNOWN = "UNKNOWN" - - -class Envelope(TypedDict, Generic[T]): - """Standard response envelope for unified API operations.""" - - success: bool - data: Optional[T] - error: Optional[str] - error_type: Optional[ErrorType] - retryable: bool - - -def ok(data: T) -> Envelope[T]: - """Create a success envelope containing the provided data.""" - - return { - "success": True, - "data": data, - "error": None, - "error_type": None, - "retryable": False, - } - - -def fail( - error: str, - error_type: ErrorType | str = ErrorType.UNKNOWN, - retryable: bool = False, -) -> Envelope[None]: - """Create a failure envelope with error metadata.""" - - resolved_error_type: ErrorType - if isinstance(error_type, ErrorType): - resolved_error_type = error_type - else: - try: - resolved_error_type = ErrorType(error_type) - except ValueError: - resolved_error_type = ErrorType.UNKNOWN - - return { - "success": False, - "data": None, - "error": error, - "error_type": resolved_error_type, - "retryable": retryable, - } - - -@dataclass(slots=True) -class SessionStatus: - """Represents the current authentication session state.""" - - logged_in: bool - session_age_minutes: Optional[int] = None - last_refresh: Optional[datetime] = None - needs_mfa: bool = False - cookies_valid: bool = True - - -@dataclass(slots=True) -class AccountSummary: - """Summary details for a Schwab account.""" - - id: str - label: str - type: str - last4: Optional[str] = None - is_margin: bool = False - - -@dataclass(slots=True) -class AccountOverview: - """Aggregated balance snapshot for an account.""" - - account: AccountSummary - total_value: Optional[Decimal] = None - day_change: Optional[Decimal] = None - day_change_pct: Optional[float] = None - cash: Optional[Decimal] = None - settled_cash: Optional[Decimal] = None - buying_power: Optional[Decimal] = None - margin_balance: Optional[Decimal] = None - - -@dataclass(slots=True) -class Lot: - """Individual lot information within a position.""" - - acquired_date: Optional[str] = None - quantity: Optional[float] = None - cost_basis: Optional[Decimal] = None - lot_id: Optional[str] = None - - -@dataclass(slots=True) -class Position: - """Holding data for a specific security.""" - - symbol: str - description: Optional[str] = None - asset_type: Optional[str] = None - quantity: Optional[float] = None - market_price: Optional[Decimal] = None - market_value: Optional[Decimal] = None - cost_basis_total: Optional[Decimal] = None - unrealized_gain: Optional[Decimal] = None - unrealized_gain_pct: Optional[float] = None - lots: list[Lot] = field(default_factory=list) - - -@dataclass(slots=True) -class PortfolioSnapshot: - """Aggregated view of equity holdings across accounts.""" - - equities: list[Position] - total_value: Optional[Decimal] = None - count: int = 0 - - -@dataclass(slots=True) -class MorningstarData: - """Unified Morningstar data payload (existing equity fields).""" - - ticker: str - company_name: Optional[str] = None - previous_dividend_payment: Optional[str] = None - previous_pay_date: Optional[str] = None - previous_ex_date: Optional[str] = None - frequency: Optional[str] = None - annual_dividend_rate: Optional[str] = None - annual_dividend_yield: Optional[str] = None - fair_value: Optional[str] = None - economic_moat: Optional[str] = None - capital_allocation: Optional[str] = None - rating: Optional[int] = None - one_star_price: Optional[str] = None - five_star_price: Optional[str] = None - assessment: Optional[str] = None - range_52_week: Optional[str] = None - dividend_yield: Optional[str] = None - investment_style: Optional[str] = None - report_url: Optional[str] = None - report_date: Optional[str] = None - source: Optional[str] = None - - -@dataclass(slots=True) -class Transaction: - """Normalized transaction record matching transactions feature.""" - - date: str - action: str - symbol: Optional[str] - description: str - quantity: Optional[str] - price: Optional[str] - fees_comm: Optional[str] - amount: Optional[str] - - -# Phase 1 Data Structures - -@dataclass(slots=True) -class QuoteData: - """Quote and price data from symbol bar.""" - - price: Optional[float] = None - change: Optional[float] = None - change_percent: Optional[float] = None - after_hours_price: Optional[float] = None - after_hours_change: Optional[float] = None - after_hours_change_percent: Optional[float] = None - bid: Optional[float] = None - ask: Optional[float] = None - bid_ask_size: Optional[str] = None - previous_close: Optional[float] = None - open: Optional[float] = None - volume: Optional[int] = None - volume_vs_avg: Optional[str] = None - day_range_low: Optional[float] = None - day_range_high: Optional[float] = None - week_52_low: Optional[float] = None - week_52_high: Optional[float] = None - market_cap: Optional[str] = None - sector: Optional[str] = None - exchange: Optional[str] = None - - -@dataclass(slots=True) -class EnhancedDividends: - """Enhanced dividend data including forward-looking information.""" - - # Forward-looking data (Phase 1) - next_payment: Optional[float] = None - next_pay_date: Optional[str] = None - next_ex_date: Optional[str] = None - - # Existing data - frequency: Optional[str] = None - annual_rate: Optional[float] = None - annual_yield: Optional[float] = None - previous_payment: Optional[float] = None - previous_pay_date: Optional[str] = None - previous_ex_date: Optional[str] = None - - -@dataclass(slots=True) -class EarningsData: - """Core earnings metrics and forecasts.""" - - # Upcoming earnings - next_announcement_date: Optional[str] = None - announcement_timing: Optional[str] = None - analysts_covering: Optional[int] = None - consensus_estimate: Optional[float] = None - estimate_high: Optional[float] = None - estimate_low: Optional[float] = None - - # Historical earnings - eps_ttm: Optional[float] = None - revenue_ttm: Optional[float] = None # Stored in dollars - pe_ttm: Optional[float] = None - forward_pe: Optional[float] = None - peg_ratio: Optional[float] = None - - # Beat/miss history (simplified for Phase 1) - recent_beats: list[dict] = field(default_factory=list) - future_estimates: list[dict] = field(default_factory=list) - - -@dataclass(slots=True) -class CalculatedMetrics: - """Calculated metrics derived from other data.""" - - payout_ratio: Optional[float] = None - - -@dataclass(slots=True) -class EquityPhase1Data: - """Complete Phase 1 enhanced equity data.""" - - ticker: str - quote: Optional[QuoteData] = None - dividends: Optional[EnhancedDividends] = None - earnings: Optional[EarningsData] = None - calculated_metrics: Optional[CalculatedMetrics] = None - - diff --git a/schwab_scraper/core/errors.py b/schwab_scraper/core/errors.py deleted file mode 100644 index 95f31ff..0000000 --- a/schwab_scraper/core/errors.py +++ /dev/null @@ -1,30 +0,0 @@ -class ScraperError(Exception): - """Base class for scraper-related errors.""" - - -class SessionExpiredError(ScraperError): - pass - - -class LoginError(ScraperError): - pass - - -class InvalidTickerError(ScraperError): - pass - - -class NoDataError(ScraperError): - pass - - -class DownloadError(ScraperError): - pass - - -class PdfParseError(ScraperError): - pass - - -class NavigationError(ScraperError): - pass diff --git a/schwab_scraper/core/models.py b/schwab_scraper/core/models.py deleted file mode 100644 index 29b26ce..0000000 --- a/schwab_scraper/core/models.py +++ /dev/null @@ -1,66 +0,0 @@ -from dataclasses import dataclass -from typing import Optional, List - -@dataclass -class DividendsData: - previous_payment: Optional[str] = None - previous_pay_date: Optional[str] = None - previous_ex_date: Optional[str] = None - frequency: Optional[str] = None - annual_dividend_rate: Optional[str] = None - annual_dividend_yield: Optional[str] = None - -@dataclass -class MorningstarPdfData: - fair_value: Optional[str] = None - economic_moat: Optional[str] = None - capital_allocation: Optional[str] = None - rating: Optional[int] = None - one_star_price: Optional[str] = None - five_star_price: Optional[str] = None - assessment: Optional[str] = None - range_52_week: Optional[str] = None - dividend_yield: Optional[str] = None - investment_style: Optional[str] = None - report_url: Optional[str] = None - report_date: Optional[str] = None - -@dataclass -class ScrapeResult: - ticker: str - company_name: Optional[str] - dividends: DividendsData - morningstar: MorningstarPdfData - source: str # "live" | "cache" - - -# -------------------- Transactions Feature -------------------- - -@dataclass -class AccountInfo: - account_type: str # e.g., "Joint", "IRA", "Individual" - account_ending: str # e.g., "604", "197", "873" - full_description: str # e.g., "Joint …604 (Account ending in 6 0 4)" - is_selected: bool = False - - -@dataclass -class TransactionRecord: - date: str - action: str - symbol: Optional[str] - description: str - quantity: Optional[str] - price: Optional[str] - fees_comm: Optional[str] - amount: Optional[str] - - -@dataclass -class TransactionData: - account_info: AccountInfo - transactions: List[TransactionRecord] - date_range: str - export_date: str - total_transactions: int - source: str # "live" | "cache" diff --git a/schwab_scraper/features/__init__.py b/schwab_scraper/features/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/schwab_scraper/features/accounts_positions/__init__.py b/schwab_scraper/features/accounts_positions/__init__.py deleted file mode 100644 index 00a04e3..0000000 --- a/schwab_scraper/features/accounts_positions/__init__.py +++ /dev/null @@ -1,14 +0,0 @@ -"""Unified accounts and positions feature package.""" - -from .accounts_scraper import list_accounts -from .overview_scraper import get_account_overview -from .positions_scraper import get_positions -from .portfolio_scraper import get_portfolio_snapshot - -__all__ = [ - "list_accounts", - "get_account_overview", - "get_positions", - "get_portfolio_snapshot", -] - diff --git a/schwab_scraper/features/accounts_positions/accounts_scraper.py b/schwab_scraper/features/accounts_positions/accounts_scraper.py deleted file mode 100644 index 1173447..0000000 --- a/schwab_scraper/features/accounts_positions/accounts_scraper.py +++ /dev/null @@ -1,153 +0,0 @@ -from __future__ import annotations - -import asyncio -import re -from typing import Optional - -from ...core import AccountSummary, Envelope, ErrorType, fail, ok -from ...browser.client import connect, new_context, new_page -from ...browser.navigation import goto_with_auth_check -from ...browser.auth import ensure_cookies -from ...core.config import get_playwright_url, load_config - -# Use the same URL as transactions feature for consistency and reliability -TRANSACTION_HISTORY_URL = "https://client.schwab.com/app/accounts/history/#/" - - -def _normalize_account_option(text: str, value: str) -> Optional[AccountSummary]: - text = text.strip() - if not text: - return None - - normalized_text = re.sub(r"\s+", " ", text) - - last4_match = re.search(r"(\d{3,4})", normalized_text.replace(" ", "")) - last4 = last4_match.group(1)[-4:] if last4_match else None - - type_match = re.search(r"^([A-Za-z&'\- ]+)", normalized_text) - account_type = (type_match.group(1).strip() if type_match else "Account").replace(" ", "_") - - account_id_candidates = [candidate for candidate in (value.strip(), last4, normalized_text) if candidate] - account_id = account_id_candidates[0] if account_id_candidates else normalized_text - - - label = normalized_text - is_margin = "margin" in normalized_text.lower() - - return AccountSummary( - id=account_id, - label=label, - type=account_type, - last4=last4, - is_margin=is_margin, - ) - - -async def list_accounts(debug: bool = False) -> Envelope[list[AccountSummary]]: - """ - Discover accounts from Schwab transaction history page. - - Uses the robust account discovery logic from the transactions feature - which handles multiple selector patterns and has enhanced reliability. - """ - cookies = await ensure_cookies() - if not cookies: - return fail("Unable to establish Schwab session.", ErrorType.AUTHENTICATION, retryable=False) - - config = load_config() - playwright_url = get_playwright_url(config) - - playwright = browser = context = page = None - try: - playwright, browser = await connect(playwright_url) - context = await new_context(browser, cookies=cookies) - page = await new_page(context) - - if not await goto_with_auth_check(page, context, TRANSACTION_HISTORY_URL, debug=debug): - return fail("Failed to load transaction history for account discovery.", ErrorType.AUTHENTICATION, retryable=True) - - # Allow page to fully load - await asyncio.sleep(2) - - # Use the robust account discovery from transactions feature - from ..transactions.scraper import discover_accounts_from_page - - discovered_accounts = await discover_accounts_from_page(page, debug=debug) - - if not discovered_accounts: - return fail("Account dropdown not found on transaction history page.", ErrorType.PARSING, retryable=True) - - # Convert discovered accounts to AccountSummary objects - accounts: list[AccountSummary] = [] - seen_ids: set[str] = set() - - for acc in discovered_accounts: - # Create AccountSummary from discovered account info - account_id = acc.get('ending', acc.get('label', '')) - - if account_id and account_id not in seen_ids: - summary = AccountSummary( - id=account_id, - label=acc.get('label', ''), - type=acc.get('type', 'Account'), - last4=acc.get('ending', ''), - is_margin=False, # Will be enhanced in future if needed - ) - accounts.append(summary) - seen_ids.add(account_id) - - if not accounts: - return fail("No accounts discovered from Schwab transaction history.", ErrorType.PARSING, retryable=True) - - if debug: - print(f"DEBUG: Successfully discovered {len(accounts)} accounts:") - for acc in accounts: - print(f"DEBUG: - {acc.label} (type: {acc.type}, last4: {acc.last4})") - - return ok(accounts) - except Exception as exc: - if debug: - print(f"DEBUG: Account discovery error: {exc}") - return fail(str(exc), ErrorType.UNKNOWN, retryable=True) - finally: - await _safe_close_page(page) - await _safe_close_context(context) - await _safe_close_browser(browser) - await _safe_stop_playwright(playwright) - - -async def _safe_close_page(page) -> None: - if page is None: - return - try: - await page.close() - except Exception: - pass - - -async def _safe_close_context(context) -> None: - if context is None: - return - try: - await context.close() - except Exception: - pass - - -async def _safe_close_browser(browser) -> None: - if browser is None: - return - try: - await browser.close() - except Exception: - pass - - -async def _safe_stop_playwright(playwright) -> None: - if playwright is None: - return - try: - await playwright.stop() - except Exception: - pass - diff --git a/schwab_scraper/features/accounts_positions/overview_scraper.py b/schwab_scraper/features/accounts_positions/overview_scraper.py deleted file mode 100644 index a708d23..0000000 --- a/schwab_scraper/features/accounts_positions/overview_scraper.py +++ /dev/null @@ -1,426 +0,0 @@ -from __future__ import annotations - -import asyncio -import re -from decimal import Decimal, InvalidOperation -from typing import Any, Optional, Sequence - -from ...browser.auth import ensure_cookies -from ...browser.client import connect, new_context, new_page -from ...browser.navigation import goto_with_auth_check -from ...core import AccountOverview, AccountSummary, Envelope, ErrorType, fail, ok -from ...core.config import get_playwright_url, load_config - -SUMMARY_URL = "https://client.schwab.com/accounts/summary/summary.aspx/" - - -def _parse_currency(value: str | None) -> Optional[Decimal]: - if not value: - return None - - cleaned = value.strip() - if not cleaned or cleaned in {"-", "--"}: - return None - - negative = False - if cleaned.startswith("(") and cleaned.endswith(")"): - negative = True - cleaned = cleaned.replace("$", "").replace(",", "") - cleaned = cleaned.replace("(", "").replace(")", "") - cleaned = cleaned.replace("−", "-").strip() - - if not cleaned: - return None - - try: - parsed = Decimal(cleaned) - if negative or parsed < 0: - parsed = -abs(parsed) - return parsed - except InvalidOperation: - return None - - -def _parse_percentage(value: str | None) -> Optional[float]: - if not value: - return None - cleaned = value.strip() - if not cleaned: - return None - - negative = False - if cleaned.startswith("(") and cleaned.endswith(")"): - negative = True - - cleaned = cleaned.replace("%", "").replace("(", "").replace(")", "") - cleaned = cleaned.replace("−", "-").strip() - - if not cleaned: - return None - - try: - parsed = float(cleaned) - except ValueError: - return None - - if negative or parsed < 0: - parsed = -abs(parsed) - return parsed - - -def _normalize_account_label(label: str) -> AccountSummary: - normalized = re.sub(r"\s+", " ", label).strip() - last4_match = re.search(r"(\d{3,4})\b", normalized.replace(" ", "")) - last4 = last4_match.group(1)[-4:] if last4_match else None - - type_match = re.search(r"^[A-Za-z&'\- ]+", normalized) - account_type = re.sub(r"\s+", "_", type_match.group(0).strip()) if type_match else "Account" - - account_id = f"{account_type}-{last4}" if last4 else account_type - - return AccountSummary( - id=account_id, - label=normalized, - type=account_type, - last4=last4, - is_margin="margin" in normalized.lower(), - ) - - -def _match_account(candidate: AccountSummary, requested: AccountSummary | str | None) -> bool: - if requested is None: - return True - if isinstance(requested, AccountSummary): - requested_values = { - requested.id.lower(), - requested.label.lower(), - } - if requested.last4: - requested_values.add(requested.last4.lower()) - else: - lookup = requested.strip().lower() - requested_values = {lookup} - - candidate_values = {candidate.id.lower(), candidate.label.lower()} - if candidate.last4: - candidate_values.add(candidate.last4.lower()) - - return bool(candidate_values & requested_values) - - -def _rows_to_dicts(headers: Sequence[str], rows: Sequence[Sequence[str]]) -> list[dict[str, str]]: - normalized_headers = [header.strip().lower() for header in headers] - results: list[dict[str, str]] = [] - for row in rows: - row_map: dict[str, str] = {} - for idx, header in enumerate(normalized_headers): - if idx < len(row): - row_map[header] = row[idx].strip() - results.append(row_map) - return results - - -async def _extract_table(page) -> dict[str, Any] | None: - return await page.evaluate( - """ - () => { - const wrapper = document.querySelector('.sdps-tables__wrapper'); - if (!wrapper) { - return null; - } - - const headerRow = wrapper.querySelector('.sdps-tables__row--header'); - const headers = headerRow - ? Array.from(headerRow.querySelectorAll('.sdps-tables__header-text')) - .map((el) => (el.textContent || '').trim()) - : []; - - if (!headers.length) { - const legacyHeaders = wrapper.querySelectorAll('thead th'); - if (legacyHeaders.length) { - for (const th of legacyHeaders) { - headers.push((th.textContent || '').trim()); - } - } - } - - const bodyRows = wrapper.querySelectorAll('.sdps-tables__row--body'); - const rows = []; - if (bodyRows.length) { - bodyRows.forEach((row) => { - const cells = Array.from( - row.querySelectorAll('.sdps-tables__cell, div[role="cell"], td') - ).map((cell) => (cell.textContent || '').trim()); - rows.push(cells); - }); - } - - if (!rows.length) { - const fallbackRows = wrapper.querySelectorAll('tbody tr'); - fallbackRows.forEach((row) => { - const cells = Array.from(row.querySelectorAll('td')).map((cell) => (cell.textContent || '').trim()); - if (cells.length) { - rows.push(cells); - } - }); - } - - return { headers, rows }; - } - """ - ) - - -async def _extract_totals(page) -> dict[str, str | None]: - return await page.evaluate( - r""" - () => { - const result = { total: null, dayChange: null, dayChangePct: null, cash: null }; - - const totalLabel = document.querySelector('#total-value-label'); - if (totalLabel) { - const valueEl = totalLabel.closest('[class*="sdps-panel"], h2, div'); - if (valueEl) { - const currencyMatch = valueEl.textContent?.match(/\$[\d,]+\.?\d*/); - if (currencyMatch) { - result.total = currencyMatch[0]; - } - } - } - - const dayChangeLabel = document.querySelector('#day-change-label'); - if (dayChangeLabel) { - const container = dayChangeLabel.parentElement; - if (container) { - const matchCurrency = container.textContent?.match(/\$[\d,]+\.?\d*/); - const matchPct = container.textContent?.match(/-?\d+(?:\.\d+)?%/); - if (matchCurrency) { - result.dayChange = matchCurrency[0]; - } - if (matchPct) { - result.dayChangePct = matchPct[0]; - } - } - } - - const cashLabel = Array.from(document.querySelectorAll('.sdps-tables__header-text')).find((el) => - el.textContent?.toLowerCase().includes('cash & cash investments') - ); - if (cashLabel) { - const container = cashLabel.closest('div'); - if (container) { - const matchCurrency = container.textContent?.match(/\$[\d,]+\.?\d*/); - if (matchCurrency) { - result.cash = matchCurrency[0]; - } - } - } - - return result; - } - """ - ) - - -def _row_to_overview(row_map: dict[str, str]) -> tuple[AccountSummary, AccountOverview]: - label = row_map.get('name') or row_map.get('account') or row_map.get('account name') or row_map.get('', '') - label = label or "Account" - - account_summary = _normalize_account_label(label) - - total_value = _parse_currency( - row_map.get('account value') - or row_map.get('total value') - or row_map.get('market value') - ) - - day_change = _parse_currency( - row_map.get('day change $') - or row_map.get('day change') - or row_map.get('day change amount') - ) - - day_change_pct = _parse_percentage( - row_map.get('day change %') - or row_map.get('day change percent') - ) - - cash_value = _parse_currency( - row_map.get('cash & cash investments') - or row_map.get('cash') - ) - - settled_cash = _parse_currency(row_map.get('settled cash')) - buying_power = _parse_currency(row_map.get('buying power') or row_map.get('available to trade')) - margin_balance = _parse_currency(row_map.get('margin balance') or row_map.get('margin')) - - overview = AccountOverview( - account=account_summary, - total_value=total_value, - day_change=day_change, - day_change_pct=day_change_pct, - cash=cash_value, - settled_cash=settled_cash, - buying_power=buying_power, - margin_balance=margin_balance, - ) - - return account_summary, overview - - -async def get_account_overview( - account: AccountSummary | str | None = None, *, debug: bool = False -) -> Envelope[AccountOverview]: - cookies = await ensure_cookies() - if not cookies: - return fail("Unable to establish Schwab session.", ErrorType.AUTHENTICATION, retryable=False) - - config = load_config() - playwright_url = get_playwright_url(config) - - playwright = browser = context = page = None - try: - playwright, browser = await connect(playwright_url) - context = await new_context(browser, cookies=cookies) - page = await new_page(context) - - if not await goto_with_auth_check(page, context, SUMMARY_URL, debug=debug): - return fail("Failed to load Schwab account summary page.", ErrorType.AUTHENTICATION, retryable=True) - - await asyncio.sleep(1) - - table_data = await _extract_table(page) - if not table_data: - return fail("Unable to locate account overview table.", ErrorType.PARSING, retryable=True) - - row_dicts = _rows_to_dicts(table_data["headers"], table_data["rows"]) - matched_overviews: list[AccountOverview] = [] - - for row_map in row_dicts: - # Skip empty rows or totals indicated by lack of numeric data - values = "".join(row_map.values()) - if not values: - continue - - summary, overview = _row_to_overview(row_map) - if _match_account(summary, account): - matched_overviews.append(overview) - - if not matched_overviews: - return fail("Account not found in overview table.", ErrorType.VALIDATION, retryable=False) - - if account is None and len(matched_overviews) > 1: - aggregated = _aggregate_overviews(matched_overviews) - totals = await _extract_totals(page) - if totals: - if totals.get("total"): - aggregated.total_value = _parse_currency(totals.get("total")) - if totals.get("dayChange"): - aggregated.day_change = _parse_currency(totals.get("dayChange")) - if totals.get("dayChangePct"): - aggregated.day_change_pct = _parse_percentage(totals.get("dayChangePct")) - if totals.get("cash"): - aggregated.cash = _parse_currency(totals.get("cash")) - return ok(aggregated) - - return ok(matched_overviews[0]) - except Exception as exc: - return fail(str(exc), ErrorType.UNKNOWN, retryable=True) - finally: - await _safe_close_page(page) - await _safe_close_context(context) - await _safe_close_browser(browser) - await _safe_stop_playwright(playwright) - - -def _aggregate_overviews(overviews: Sequence[AccountOverview]) -> AccountOverview: - total_value = Decimal("0") - day_change = Decimal("0") - cash_total = Decimal("0") - settled_total = Decimal("0") - buying_total = Decimal("0") - margin_total = Decimal("0") - - for item in overviews: - if item.total_value is not None: - total_value += item.total_value - if item.day_change is not None: - day_change += item.day_change - if item.cash is not None: - cash_total += item.cash - if item.settled_cash is not None: - settled_total += item.settled_cash - if item.buying_power is not None: - buying_total += item.buying_power - if item.margin_balance is not None: - margin_total += item.margin_balance - - aggregated_summary = AccountSummary( - id="AGGREGATE", - label="All Accounts", - type="AGGREGATE", - last4=None, - is_margin=False, - ) - - total_value_out = total_value if total_value != 0 else None - day_change_out = day_change if day_change != 0 else None - cash_out = cash_total if cash_total != 0 else None - settled_out = settled_total if settled_total != 0 else None - buying_out = buying_total if buying_total != 0 else None - margin_out = margin_total if margin_total != 0 else None - - day_change_pct: Optional[float] = None - if total_value_out and day_change_out: - try: - day_change_pct = float((day_change_out / total_value_out) * 100) - except (InvalidOperation, ZeroDivisionError): - day_change_pct = None - - return AccountOverview( - account=aggregated_summary, - total_value=total_value_out, - day_change=day_change_out, - day_change_pct=day_change_pct, - cash=cash_out, - settled_cash=settled_out, - buying_power=buying_out, - margin_balance=margin_out, - ) - - -async def _safe_close_page(page) -> None: - if page is None: - return - try: - await page.close() - except Exception: - pass - - -async def _safe_close_context(context) -> None: - if context is None: - return - try: - await context.close() - except Exception: - pass - - -async def _safe_close_browser(browser) -> None: - if browser is None: - return - try: - await browser.close() - except Exception: - pass - - -async def _safe_stop_playwright(playwright) -> None: - if playwright is None: - return - try: - await playwright.stop() - except Exception: - pass - diff --git a/schwab_scraper/features/accounts_positions/portfolio_scraper.py b/schwab_scraper/features/accounts_positions/portfolio_scraper.py deleted file mode 100644 index bffbec4..0000000 --- a/schwab_scraper/features/accounts_positions/portfolio_scraper.py +++ /dev/null @@ -1,134 +0,0 @@ -from __future__ import annotations - -from decimal import Decimal, InvalidOperation -from typing import Iterable, Optional - -from ...core import AccountSummary, Envelope, ErrorType, PortfolioSnapshot, Position, fail, ok -from .positions_scraper import get_positions - - -def _aggregate_positions(positions: Iterable[Position]) -> tuple[list[Position], Optional[Decimal]]: - aggregated: dict[str, Position] = {} - total_value = Decimal("0") - has_value = False - - for position in positions: - if position.market_value is not None: - total_value += position.market_value - has_value = True - - key = position.symbol.upper() if position.symbol else "UNKNOWN" - if key not in aggregated: - aggregated[key] = Position( - symbol=position.symbol, - description=position.description, - asset_type=position.asset_type, - quantity=position.quantity, - market_price=position.market_price, - market_value=position.market_value, - cost_basis_total=position.cost_basis_total, - unrealized_gain=position.unrealized_gain, - unrealized_gain_pct=position.unrealized_gain_pct, - lots=list(position.lots), - ) - continue - - existing = aggregated[key] - - if position.quantity is not None: - if existing.quantity is None: - existing.quantity = position.quantity - else: - existing.quantity += position.quantity - - if position.market_value is not None: - if existing.market_value is None: - existing.market_value = position.market_value - else: - existing.market_value += position.market_value - - if position.cost_basis_total is not None: - if existing.cost_basis_total is None: - existing.cost_basis_total = position.cost_basis_total - else: - existing.cost_basis_total += position.cost_basis_total - - if position.unrealized_gain is not None: - if existing.unrealized_gain is None: - existing.unrealized_gain = position.unrealized_gain - else: - existing.unrealized_gain += position.unrealized_gain - - if position.market_price is not None: - existing.market_price = position.market_price - - if position.unrealized_gain_pct is not None: - existing.unrealized_gain_pct = position.unrealized_gain_pct - - if position.description and not existing.description: - existing.description = position.description - - if position.asset_type: - existing.asset_type = position.asset_type - - if position.lots: - existing.lots.extend(position.lots) - - for item in aggregated.values(): - if item.unrealized_gain is not None and item.cost_basis_total not in (None, Decimal("0")): - try: - item.unrealized_gain_pct = float((item.unrealized_gain / item.cost_basis_total) * 100) - except (InvalidOperation, ZeroDivisionError): - item.unrealized_gain_pct = None - - total_value_out = total_value if has_value else None - return list(aggregated.values()), total_value_out - - -async def get_portfolio_snapshot( - account: AccountSummary | str | None = None, - *, - aggregate_by_symbol: bool = True, - include_non_equity: bool = False, - debug: bool = False, -) -> Envelope[PortfolioSnapshot]: - positions_envelope = await get_positions( - account=account, - include_non_equity=include_non_equity, - debug=debug, - ) - - if not positions_envelope["success"]: - return fail( - positions_envelope.get("error") or "Failed to retrieve positions.", - positions_envelope.get("error_type") or ErrorType.UNKNOWN, - positions_envelope.get("retryable", True), - ) - - positions = positions_envelope["data"] or [] - - if aggregate_by_symbol: - aggregated_positions, total_value = _aggregate_positions(positions) - count = len(aggregated_positions) - snapshot = PortfolioSnapshot( - equities=aggregated_positions, - total_value=total_value, - count=count, - ) - return ok(snapshot) - - total_value = Decimal("0") - has_value = False - for position in positions: - if position.market_value is not None: - total_value += position.market_value - has_value = True - - total_value_out = total_value if has_value else None - snapshot = PortfolioSnapshot( - equities=positions, - total_value=total_value_out, - count=len(positions), - ) - return ok(snapshot) - diff --git a/schwab_scraper/features/accounts_positions/positions_scraper.py b/schwab_scraper/features/accounts_positions/positions_scraper.py deleted file mode 100644 index 2e704ec..0000000 --- a/schwab_scraper/features/accounts_positions/positions_scraper.py +++ /dev/null @@ -1,648 +0,0 @@ -from __future__ import annotations - -import re -from decimal import Decimal, InvalidOperation -from typing import Any, Optional, Sequence - -from ...browser.auth import ensure_cookies -from ...browser.client import connect, new_context, new_page -from ...browser.navigation import goto_with_auth_check -from ...core import AccountSummary, Envelope, ErrorType, Lot, Position, fail, ok -from ...core.config import get_playwright_url, load_config -from ...utils.logging import save_debug_artifact - -POSITIONS_URL = "https://client.schwab.com/app/accounts/positions/#/" - - -def _parse_decimal(value: str | None) -> Optional[Decimal]: - if not value: - return None - - cleaned = value.strip() - if not cleaned or cleaned in {"-", "--"}: - return None - - negative = False - if cleaned.startswith("(") and cleaned.endswith(")"): - negative = True - - cleaned = ( - cleaned.replace("$", "") - .replace(",", "") - .replace("(", "") - .replace(")", "") - .replace("−", "-") - .replace("%", "") - .strip() - ) - - if not cleaned: - return None - - try: - parsed = Decimal(cleaned) - if negative or parsed < 0: - parsed = -abs(parsed) - return parsed - except InvalidOperation: - return None - - -def _parse_float(value: str | None) -> Optional[float]: - decimal_value = _parse_decimal(value) - if decimal_value is None: - return None - try: - return float(decimal_value) - except (ValueError, InvalidOperation): - return None - - -def _normalize_account_label(label: str) -> AccountSummary: - normalized = re.sub(r"\s+", " ", label).strip() - last4_match = re.search(r"(\d{3,4})\b", normalized.replace(" ", "")) - last4 = last4_match.group(1)[-4:] if last4_match else None - - type_match = re.search(r"^[A-Za-z&'\- ]+", normalized) - account_type = re.sub(r"\s+", "_", type_match.group(0).strip()) if type_match else "Account" - - account_id = f"{account_type}-{last4}" if last4 else account_type - - return AccountSummary( - id=account_id, - label=normalized, - type=account_type, - last4=last4, - is_margin="margin" in normalized.lower(), - ) - - -def _match_account(candidate: AccountSummary, requested: AccountSummary | str | None) -> bool: - if requested is None: - return True - - if isinstance(requested, AccountSummary): - requested_values = { - requested.id.lower(), - requested.label.lower(), - } - if requested.last4: - requested_values.add(requested.last4.lower()) - else: - lookup = requested.strip().lower() - requested_values = {lookup} - - candidate_values = {candidate.id.lower(), candidate.label.lower()} - if candidate.last4: - candidate_values.add(candidate.last4.lower()) - - return bool(candidate_values & requested_values) - - -def classify_asset(symbol: str | None, description: str | None) -> str: - if symbol: - sym = symbol.strip().upper() - else: - sym = "" - desc = (description or "").strip().upper() - - if sym and re.fullmatch(r"[A-Z]{1,5}", sym): - if "ETF" in desc: - return "ETF" - if any(kw in desc for kw in ["FUND", "MUTUAL"]): - return "MUTUAL_FUND" - return "EQUITY" - - if sym and re.search(r"\d", sym) and len(sym) > 5: - return "OPTION" - - if any(kw in desc for kw in ["BOND", "CD", "TREASURY"]): - return "BOND" - - if sym in {"CASH", "MMDA", "SWEEP"} or "CASH" in desc: - return "CASH" - - if "ETF" in desc: - return "ETF" - if "FUND" in desc: - return "MUTUAL_FUND" - - return "OTHER" - - -async def _evaluate_table(page) -> dict[str, Any] | None: - return await page.evaluate( - """ - () => { - const table = document.querySelector('#positionsDetails'); - if (!table) { - return null; - } - - const headers = Array.from(table.querySelectorAll('thead tr th')).map((th) => { - const btn = th.querySelector('button, .sdps-tables__header-text'); - if (btn) { - return (btn.innerText || btn.textContent || '').trim(); - } - return (th.innerText || th.textContent || '').trim(); - }); - - const rowElements = Array.from(table.querySelectorAll('tbody tr')); - const rows = []; - let current = null; - let currentAccount = null; - - const isLotRow = (row) => { - const klass = (row.className || '').toLowerCase(); - const tagName = (row.tagName || '').toLowerCase(); - return klass.includes('lot') || klass.includes('child') || tagName.includes('app-lot'); - }; - - const isPositionRow = (row) => { - const klass = (row.className || '').toLowerCase(); - return klass.includes('position-row'); - }; - - const isAccountHeader = (row) => { - const klass = (row.className || '').toLowerCase(); - const text = (row.textContent || '').trim(); - return !klass.includes('position-row') && - (klass.includes('highlight-row') || klass.includes('border-top-dark')) && - text.includes('account panel'); - }; - - for (const row of rowElements) { - // Check if this is an account header row - if (isAccountHeader(row)) { - const text = row.textContent.trim(); - // Extract account name from account panel text - const match = text.match(/account panel[\\s\\n]+([^\\n]+)/); - if (match) { - currentAccount = match[1].trim(); - } - continue; - } - - const cells = Array.from(row.querySelectorAll('td')).map((cell) => { - // 1. Try to find a title attribute on a span (often has more precise value) - const titledSpan = cell.querySelector('span[title]'); - if (titledSpan && titledSpan.getAttribute('title').trim().length > 0) { - const title = titledSpan.getAttribute('title').trim(); - if (title.includes('$') || /^[+-]?[\\d,.]+$/.test(title) || title.includes('%')) { - return title; - } - } - - // 2. Try to find text directly or within common button/link wrappers - const btn = cell.querySelector('button, a, .sdps-button'); - if (btn) { - // Check button title too - if (btn.hasAttribute('title') && btn.getAttribute('title').trim().length > 0) { - return btn.getAttribute('title').trim(); - } - // Ignore some internal elements like superscripts if present - const clone = btn.cloneNode(true); - clone.querySelectorAll('sup, .sdps-sr-only').forEach(el => el.remove()); - return (clone.innerText || clone.textContent || '').trim(); - } - - // 3. Just clean up the cell text - const clone = cell.cloneNode(true); - clone.querySelectorAll('sup, .sdps-sr-only').forEach(el => el.remove()); - return (clone.innerText || clone.textContent || '').trim(); - }); - - if (!cells.length) { - continue; - } - - if (isLotRow(row)) { - if (current) { - // For lots, we typically skip the first two columns (empty/checkbox) - current.lots.push(cells.slice(2)); - } - } else if (isPositionRow(row)) { - // Extract symbol from data-symbol attribute - const symbol = row.getAttribute('data-symbol') || ''; - current = { - type: 'position', - cells: cells, - lots: [], - symbol: symbol, - account: currentAccount - }; - rows.push(current); - } - } - - return { headers, rows }; - } - """ - ) - - -def _map_row(headers: Sequence[str], cells: Sequence[str]) -> dict[str, str]: - result: dict[str, str] = {} - - # Filter out empty headers to get the list of "real" data columns - data_headers = [] - for h in headers: - # Replace non-breaking spaces and other special whitespace with regular spaces - h_clean = h.replace('\u00a0', ' ').replace('\u200b', '').strip() - name = h_clean.split('\n')[0].strip().lower() - if name: - data_headers.append(name) - else: - data_headers.append(f"empty_{len(data_headers)}") - - # We skip headers that definitely don't have cells (checkbox, symbol is usually in data-symbol) - # Looking at debug output, 'description' is the first cell. - # So we find where 'description' or 'name' is in our data_headers. - - start_idx = -1 - for i, h in enumerate(data_headers): - if h in {'description', 'name'}: - start_idx = i - break - - if start_idx == -1: - # Fallback to simple index mapping if we can't find description - for i, cell in enumerate(cells): - key = data_headers[i] if i < len(data_headers) else f"column_{i}" - result[key] = cell - return result - - # Map cells starting from description - for i, cell in enumerate(cells): - header_idx = start_idx + i - if header_idx < len(data_headers): - key = data_headers[header_idx] - result[key] = cell - else: - result[f"extra_{i}"] = cell - - return result - - -def _parse_lots(lot_rows: Sequence[Sequence[str]]) -> list[Lot]: - lots: list[Lot] = [] - for cells in lot_rows: - if not cells: - continue - - # New modal table columns: - # 0: Open Date, 1: Quantity, 2: Price, 3: Cost/Share, 4: Market Value, 5: Cost Basis, ... - acquired_date = cells[0].strip() if len(cells) > 0 else None - quantity = _parse_float(cells[1] if len(cells) > 1 else None) - # In the modal table, index 5 is Cost Basis. index 3 is Cost/Share. - # Position-level Lot contract has 'cost_basis' field which typically means total cost. - cost_basis = _parse_decimal(cells[5] if len(cells) > 5 else None) - # lot_id isn't explicitly in the table, we'll use holding period or empty - lot_id = cells[8].strip() if len(cells) > 8 else None - - lots.append( - Lot( - acquired_date=acquired_date or None, - quantity=quantity or 0.0, - cost_basis=cost_basis, - lot_id=lot_id or None, - ) - ) - return lots - - -def _row_to_position(row_map: dict[str, str], lots_rows: Sequence[Sequence[str]], symbol: str = "") -> Position: - # Symbol is now passed from data-symbol attribute on row - # Description is in the first visible cell - description = row_map.get('description') or row_map.get('name') or row_map.get('column_1') or "" - - # Price is typically in column labeled 'price' or similar - # From debug info: 'price chng $' is next, but market price was likely mapped earlier or skipped - # Actually 'price' was likely one of the empty headers that didn't have a button? - # No, debug info shows: [..., 'Qty', '', 'Price Chng $', ...] - # And cells: [..., '2,944.633', 'TITLE:04/24/2026', 'TITLE:+1.13%', ...] - # 'TITLE:04/24/2026' corresponds to the empty header between Qty and Price Chng $ - # That title contains the date, but the cell text is usually the price. - - market_price = _parse_decimal( - row_map.get('price') - or row_map.get('market price') - or row_map.get('last price') - or row_map.get('empty_4') - or row_map.get('empty_5') - ) - - # Quantity - quantity = _parse_float(row_map.get('qty') or row_map.get('quantity')) - market_value = _parse_decimal(row_map.get('mkt val') or row_map.get('market value')) - - # Cost basis mapping - cost_basis_total = _parse_decimal( - row_map.get('cost basis') - or row_map.get('total cost') - ) - - unrealized_gain = _parse_decimal( - row_map.get('gain/loss $') - or row_map.get('unrealized gain') - or row_map.get('empty_11') # Adjusted index - ) - unrealized_gain_pct = _parse_float( - row_map.get('gain/loss %') - or row_map.get('unrealized gain %') - or row_map.get('empty_12') - ) - - asset_type = classify_asset(symbol, description) - - lots = _parse_lots(lots_rows) - - return Position( - symbol=symbol or "", - description=description or None, - asset_type=asset_type, - quantity=quantity, - market_price=market_price, - market_value=market_value, - cost_basis_total=cost_basis_total, - unrealized_gain=unrealized_gain, - unrealized_gain_pct=unrealized_gain_pct, - lots=lots, - ) - - -async def get_positions( - account: AccountSummary | str | None = None, - *, - include_non_equity: bool = False, - debug: bool = False, -) -> Envelope[list[Position]]: - cookies = await ensure_cookies() - if not cookies: - return fail("Unable to establish Schwab session.", ErrorType.AUTHENTICATION, retryable=False) - - config = load_config() - playwright_url = get_playwright_url(config) - - playwright = browser = context = page = None - try: - playwright, browser = await connect(playwright_url) - context = await new_context(browser, cookies=cookies) - page = await new_page(context) - - if not await goto_with_auth_check(page, context, POSITIONS_URL, debug=debug): - return fail("Failed to load Schwab positions page.", ErrorType.AUTHENTICATION, retryable=True) - - if account: - requested_id = account.id if isinstance(account, AccountSummary) else account - if debug: - print(f"DEBUG: Attempting to switch to account: {requested_id} via Summary page") - - # Go to summary page to switch (much more stable than dropdown) - await goto_with_auth_check(page, context, "https://client.schwab.com/app/accounts/summary/#/", debug=debug) - await page.wait_for_timeout(3000) - - # Find and click the account row link - clicked = await page.evaluate(""" - (query) => { - const rows = Array.from(document.querySelectorAll('sdps-table-row, tr')); - const targetRow = rows.find(r => r.innerText.includes(query) || r.textContent.includes(query)); - if (targetRow) { - const link = targetRow.querySelector('a.acctNavigate-button-link'); - if (link) { - link.click(); - return true; - } - } - return false; - } - """, requested_id) - - if clicked: - if debug: - print(f"DEBUG: Clicked account {requested_id} on summary page") - await page.wait_for_timeout(5000) - else: - if debug: - print(f"DEBUG: Failed to find account {requested_id} on summary page, trying dropdown as fallback...") - from ..transactions.scraper import switch_account_on_page - await switch_account_on_page(page, requested_id, context=context, debug=debug) - - # Ensure we are on positions page for the selected account - if "/accounts/positions" not in page.url: - await goto_with_auth_check(page, context, POSITIONS_URL, debug=debug) - - if debug: - html = await page.content() - save_debug_artifact("positions_page_initial.html", html) - - await page.wait_for_selector('#positionsDetails', timeout=45000) - await page.wait_for_timeout(1000) - - await page.evaluate('window.scrollTo(0, document.body.scrollHeight)') - await page.wait_for_timeout(1500) - - if debug: - html = await page.content() - save_debug_artifact("positions_page_scrolled.html", html) - png = await page.screenshot(full_page=True) - save_debug_artifact("positions_page.png", png) - - # 1. Get headers once - headers = await page.evaluate(""" - () => { - const table = document.querySelector('#positionsDetails'); - if (!table) return []; - return Array.from(table.querySelectorAll('thead tr th')).map(th => { - const btn = th.querySelector('button, .sdps-tables__header-text'); - const text = (btn ? (btn.innerText || btn.textContent) : (th.innerText || th.textContent)) || ''; - return text.trim().replace(/\\u00a0/g, ' ').replace(/\\u200b/g, '').split('\\n')[0].trim().toLowerCase(); - }); - } - """) - if not headers: - return fail("Positions table headers not found.", ErrorType.PARSING, retryable=True) - - # 1.5 Pre-cleanup: Close any accidentally opened modals - try: - open_modals = await page.query_selector_all('app-lot sdps-modal[sdps-id="open-lot-overlay"].sdps-modal--open') - for m in open_modals: - close = await m.query_selector('button.sdps-modal__close') - if close: - await close.click(force=True) - await page.wait_for_timeout(500) - except Exception: - pass - - # 2. Get all position rows metadata first to avoid stale handle issues - position_metadata = await page.evaluate(""" - () => { - const rows = Array.from(document.querySelectorAll('tr.position-row')); - return rows.map((row, index) => { - const symbol = row.getAttribute('data-symbol') || ''; - const cells = Array.from(row.querySelectorAll('td')).map((cell) => { - const btn = cell.querySelector('button, a, .sdps-button'); - if (btn) { - const clone = btn.cloneNode(true); - clone.querySelectorAll('sup, .sdps-sr-only').forEach(el => el.remove()); - let txt = clone.innerText.trim(); - if (!txt && btn.hasAttribute('title')) txt = btn.getAttribute('title').trim(); - return txt; - } - const titledSpan = cell.querySelector('span[title]'); - const clone = cell.cloneNode(true); - clone.querySelectorAll('sup, .sdps-sr-only').forEach(el => el.remove()); - let txt = clone.innerText.trim(); - // If no direct text but has a title with a number, use that - if (!txt && titledSpan && titledSpan.getAttribute('title')) { - const t = titledSpan.getAttribute('title').trim(); - if (t.includes('$') || /^[+-]?[\\d,.]+$/.test(t)) return t; - } - return txt; - }); - return { symbol, cells, index }; - }); - } - """) - - if debug: - print(f"Found {len(position_metadata)} positions to process") - - all_positions: list[Position] = [] - for meta in position_metadata: - symbol = meta['symbol'] - idx = meta['index'] - - # Re-fetch row for lot expansion if needed - lots_data = [] - try: - rows = await page.query_selector_all('tr.position-row') - if idx < len(rows): - row = rows[idx] - expander = await row.query_selector('sdps-button[sdps-id="costBasisTBD"] button') - if expander: - await expander.scroll_into_view_if_needed() - # Use force=True because sometimes modals/overlays block the click in Schwab's UI - await expander.click(force=True) - - # Wait for modal to appear - await page.wait_for_timeout(1000) - - # Find the active modal (not inert, visible, and matches our symbol) - modal_handle = None - modals = await page.query_selector_all('app-lot sdps-modal[sdps-id="open-lot-overlay"]') - for m in modals: - is_hidden = await m.evaluate('el => el.getAttribute("aria-hidden") === "true" || el.hasAttribute("inert")') - if is_hidden: - continue - - # Verify title matches symbol to avoid stale modal data - title_text = await m.evaluate('el => el.querySelector(".sdps-modal__title")?.innerText || ""') - if symbol.upper() in title_text.upper(): - modal_handle = m - break - - if modal_handle: - modal_id = await modal_handle.get_attribute('modal-id') - if debug: - print(f"Processing modal {modal_id} for {symbol}") - - # Wait for table to be populated - try: - await modal_handle.wait_for_selector('#responsiveLotTable tbody tr.data-row', timeout=3000) - except Exception: - pass - - # Extract lots from this specific modal - lots_data = await page.evaluate(f""" - (mId) => {{ - const modal = document.querySelector(`app-lot sdps-modal[modal-id="${{mId}}"]`); - if (!modal) return []; - const lotTable = modal.querySelector('#responsiveLotTable'); - if (!lotTable) return []; - const lotRows = Array.from(lotTable.querySelectorAll('tbody tr.data-row')); - return lotRows.map(r => {{ - return Array.from(r.querySelectorAll('th, td')).map(c => {{ - const clone = c.cloneNode(true); - clone.querySelectorAll('sup, .sdps-sr-only, .transactionCostColor').forEach(el => el.remove()); - return clone.innerText.trim(); - }}); - }}); - }} - """, modal_id) - - # Close this specific modal - close_btn = await modal_handle.query_selector('button.sdps-modal__close') - if close_btn: - await close_btn.click(force=True) - # Wait for modal to actually be removed or hidden - try: - await page.wait_for_selector(f'app-lot sdps-modal[modal-id="{modal_id}"]', state='hidden', timeout=3000) - except Exception: - pass - else: - if debug: - print(f"DEBUG: Could not find matching visible modal for {symbol}") - except Exception as e: - if debug: - print(f"Error expanding lots for {symbol}: {e}") - - row_map = _map_row(headers, meta['cells']) - position = _row_to_position(row_map, lots_data, symbol=symbol) - - if not include_non_equity and position.asset_type not in {"EQUITY", "ETF"}: - continue - - all_positions.append(position) - - if not all_positions: - return fail("No positions matched the requested criteria.", ErrorType.VALIDATION, retryable=False) - - return ok(all_positions) - except Exception as exc: - if debug: - import traceback - traceback.print_exc() - return fail(str(exc), ErrorType.UNKNOWN, retryable=True) - finally: - await _safe_close_page(page) - await _safe_close_context(context) - await _safe_close_browser(browser) - await _safe_stop_playwright(playwright) - - -async def _safe_close_page(page) -> None: - if page is None: - return - try: - await page.close() - except Exception: - pass - - -async def _safe_close_context(context) -> None: - if context is None: - return - try: - await context.close() - except Exception: - pass - - -async def _safe_close_browser(browser) -> None: - if browser is None: - return - try: - await browser.close() - except Exception: - pass - - -async def _safe_stop_playwright(playwright) -> None: - if playwright is None: - return - try: - await playwright.stop() - except Exception: - pass - diff --git a/schwab_scraper/features/equity/__init__.py b/schwab_scraper/features/equity/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/schwab_scraper/features/equity/morningstar.py b/schwab_scraper/features/equity/morningstar.py deleted file mode 100644 index 989d703..0000000 --- a/schwab_scraper/features/equity/morningstar.py +++ /dev/null @@ -1,239 +0,0 @@ -from typing import Optional, Tuple -import logging - - -async def find_report(page, debug: bool = False) -> Tuple[Optional[str], Optional[str]]: - """Locate the Morningstar Equity Report link and date on the stock page. - - Uses multiple fallback strategies to handle Schwab website changes. - - Returns: - Tuple of (url, date) where: - - url: The href attribute if it's a traditional link, or a special marker - '__CLICK_TO_OPEN__' if it's a JavaScript/blob link that requires clicking - - date: The report date string if found - """ - logger = logging.getLogger(__name__) - - # Strategy 1: Original selector - report_link_selector = "div[id='Morningstar Equity Report'] a.sr-report-link" - if await page.is_visible(report_link_selector): - if debug: - logger.debug("Found Morningstar report using original selector") - report_link_element = page.locator(report_link_selector) - await report_link_element.scroll_into_view_if_needed() - url = await report_link_element.get_attribute("href") - - # Date element (escaped spaces) - date_locator = page.locator(r"#Morningstar\ Equity\ Report > span:nth-child(3) > sdps-date-time > time > span:nth-child(2)") - date_text = (await date_locator.inner_text()).strip() if await date_locator.count() > 0 else None - - # Check if href is empty (modern web component using blob URLs) - if not url or url == '': - if debug: - logger.debug("Link found but href is empty - this is a modern web component that generates blob URLs on click") - # Return a special marker to indicate we need to click the link to get the URL - return '__CLICK_TO_OPEN__', date_text - - return url, date_text - - # Strategy 2: Look for any link containing "morningstar" in research section - if debug: - logger.debug("Original selector failed, trying fallback selectors...") - - fallback_selectors = [ - "a.sr-report-link[href*='morningstar']", - "a[href*='morningstar'][href*='pdf']", - "#morningstar-section a.sr-report-link", - "div[id*='Morningstar'] a", - ] - - for selector in fallback_selectors: - try: - if await page.is_visible(selector, timeout=2000): - if debug: - logger.debug(f"Found Morningstar report using fallback selector: {selector}") - report_link_element = page.locator(selector).first - await report_link_element.scroll_into_view_if_needed() - url = await report_link_element.get_attribute("href") - - # Try to find date with various selectors - date_text = None - date_selectors = [ - r"#Morningstar\ Equity\ Report > span:nth-child(3) > sdps-date-time > time > span:nth-child(2)", - "sdps-date-time time span", - "time span", - ] - for date_sel in date_selectors: - try: - date_locator = page.locator(date_sel) - if await date_locator.count() > 0: - date_text = (await date_locator.first.inner_text()).strip() - if date_text: - break - except: - continue - - return url, date_text - except Exception as e: - if debug: - logger.debug(f"Fallback selector {selector} failed: {e}") - continue - - # Strategy 3: Use JavaScript to search for Morningstar links - if debug: - logger.debug("All CSS selectors failed, trying JavaScript search...") - - try: - result = await page.evaluate(""" - () => { - // Look for any link containing 'morningstar' and 'pdf' - const links = Array.from(document.querySelectorAll('a[href]')); - const morningstarLink = links.find(link => - link.href.toLowerCase().includes('morningstar') && - link.href.toLowerCase().includes('pdf') - ); - - if (morningstarLink) { - // Try to find associated date - let dateText = null; - const parent = morningstarLink.closest('[id*="Morningstar"]') || morningstarLink.parentElement; - if (parent) { - const timeElement = parent.querySelector('time'); - if (timeElement) { - dateText = timeElement.textContent.trim(); - } - } - - return { - url: morningstarLink.href, - date: dateText - }; - } - - return null; - } - """) - - if result and result.get('url'): - if debug: - logger.debug(f"Found Morningstar report using JavaScript search: {result['url']}") - return result['url'], result.get('date') - except Exception as e: - if debug: - logger.debug(f"JavaScript search failed: {e}") - - # No report found - if debug: - logger.debug("No Morningstar report link found using any strategy") - # Capture page state for debugging - try: - await page.screenshot(path="debug_morningstar_not_found.png", full_page=True) - logger.debug("Saved debug screenshot to: debug_morningstar_not_found.png") - - # Log available elements for debugging - page_info = await page.evaluate(""" - () => { - return { - hasMorningstarSection: !!document.querySelector('#morningstar-section'), - hasMorningstarDiv: !!document.querySelector('div[id*="Morningstar"]'), - morningstarLinks: Array.from(document.querySelectorAll('a[href]')) - .filter(a => a.href.toLowerCase().includes('morningstar')) - .length, - allReportLinks: Array.from(document.querySelectorAll('a.sr-report-link')).length - } - } - """) - logger.debug(f"Page state: {page_info}") - except Exception as e: - logger.debug(f"Failed to capture debug info: {e}") - - return None, None - - -async def download_report_as_bytes(page, url: str, debug: bool = False) -> Optional[bytes]: - """Open the PDF in a new page and return bytes via data URL conversion. - - Args: - page: The current Playwright page - url: Either a traditional URL or '__CLICK_TO_OPEN__' marker for blob URLs - debug: Enable debug logging - - Returns: - PDF bytes if successful, None otherwise - """ - logger = logging.getLogger(__name__) - - if not url: - return None - - # Handle blob URL case (modern web component) - if url == '__CLICK_TO_OPEN__': - if debug: - logger.debug("Handling blob URL - clicking link to open PDF") - - # Click the Morningstar report link to open the PDF - report_link_selector = "div[id='Morningstar Equity Report'] a.sr-report-link" - - try: - # Wait for new page to open after clicking - new_page_promise = page.context.wait_for_event("page", timeout=15000) - await page.click(report_link_selector) - new_page = await new_page_promise - - if debug: - logger.debug(f"New page opened with URL: {new_page.url}") - - # Wait for PDF to load - await new_page.wait_for_load_state('load', timeout=10000) - - # The PDF is now loaded as a blob URL - extract it - blob_url = new_page.url - - except Exception as e: - if debug: - logger.debug(f"Error clicking link to open PDF: {e}") - return None - else: - # Traditional URL case - if debug: - logger.debug(f"Opening PDF from traditional URL: {url}") - - new_page_promise = page.context.wait_for_event("page") - await page.evaluate("url => window.open(url, '_blank')", url) - new_page = await new_page_promise - await new_page.wait_for_load_state('load') - blob_url = url - - # Fetch and convert to Base64 in browser context - try: - pdf_base64 = await new_page.evaluate( - """ - async (url) => { - const response = await fetch(url); - const blob = await response.blob(); - return await new Promise((resolve) => { - const reader = new FileReader(); - reader.onloadend = () => resolve(reader.result.split(',')[1]); - reader.readAsDataURL(blob); - }); - } - """, - blob_url, - ) - await new_page.close() - - if not pdf_base64: - return None - - import base64 - return base64.b64decode(pdf_base64) - - except Exception as e: - if debug: - logger.debug(f"Error extracting PDF bytes: {e}") - try: - await new_page.close() - except: - pass - return None diff --git a/schwab_scraper/features/equity/parser.py b/schwab_scraper/features/equity/parser.py deleted file mode 100644 index 4da8c5a..0000000 --- a/schwab_scraper/features/equity/parser.py +++ /dev/null @@ -1,80 +0,0 @@ -import re -from io import BytesIO -from typing import Dict -import pdfplumber - - -def clean_value(label: str, value: str) -> str: - """Cleans the extracted value based on the label.""" - if label == "Morningstar Rating": - return f"{value.count('Q')} stars" - if label == "Economic Moat": - if "Wide" in value: - return "Wide" - if "Narrow" in value: - return "Narrow" - if "None" in value: - return "None" - if label in ["Fair Value", "1-Star Price", "5-Star Price"]: - match = re.match(r"[\d,]+\.\d{2}", value) - if match: - return match.group(0) - if label == "Assessment": - return value.split()[0] - if label == "52-Week-Range": - return value.replace('\u2014', '-') - if label == "52-Week Range": - return value.replace('\u2014', '-') - return value - - -def parse(pdf_content: bytes) -> Dict[str, str]: - """ - Parses a Morningstar PDF report to extract key data points. - Returns a dict keyed by the label names present in the report. - """ - with pdfplumber.open(BytesIO(pdf_content)) as pdf: - page = pdf.pages[2] # Page 3 - words = page.extract_words(x_tolerance=1, y_tolerance=1, keep_blank_chars=False) - - data: Dict[str, str] = {} - labels = [ - "Fair Value", "1-Star Price", "5-Star Price", "Assessment", - "Dividend Yield", "Capital Allocation", "52-Week Range", "Investment Style", - "Economic Moat", "Morningstar Rating" - ] - - for i, word in enumerate(words): - # Combine words to form potential labels - for j in range(i + 1, min(i + 4, len(words))): - potential_label = " ".join(w['text'] for w in words[i:j]) - if potential_label in labels: - if potential_label == "Economic Moat": - # Find the value to the right of the label - label_end_x = words[j-1]['x1'] - value_words = [ - w['text'] for w in words[j:] - if abs(w['top'] - word['top']) < 2 and w['x0'] > label_end_x and w['x0'] - label_end_x < 100 - ] - if value_words: - value = " ".join(value_words) - if "Wide" in value: - data[potential_label] = "Wide" - elif "Narrow" in value: - data[potential_label] = "Narrow" - elif "None" in value: - data[potential_label] = "None" - break - else: - # Find the value to the right of the label - label_end_x = words[j-1]['x1'] - value_words = [ - w['text'] for w in words[j:] - if abs(w['top'] - word['top']) < 2 and w['x0'] > label_end_x and w['x0'] - label_end_x < 100 - ] - if value_words: - # Join the value words and clean them - value = " ".join(value_words) - data[potential_label] = clean_value(potential_label, value) - break # Move to the next word once a label is found - return data diff --git a/schwab_scraper/features/equity/phase1_api_scraper.py b/schwab_scraper/features/equity/phase1_api_scraper.py deleted file mode 100644 index 7477630..0000000 --- a/schwab_scraper/features/equity/phase1_api_scraper.py +++ /dev/null @@ -1,490 +0,0 @@ -"""Phase 1: API-Based Data Extraction (EXPERIMENTAL - NON-FUNCTIONAL) - -⚠️ **STATUS: NON-FUNCTIONAL DUE TO CORS RESTRICTIONS** ⚠️ - -This module was an attempt to extract equity data by calling Schwab's REST APIs directly. -While the APIs exist and were discovered via HAR analysis, they are NOT accessible from -this scraper due to fundamental browser security limitations (CORS). - -## Why This Approach Failed: - -1. **CORS (Cross-Origin Resource Sharing) Restrictions**: - - Research page: `client.schwab.com`, APIs: `ausgateway.schwab.com` (different origins) - - Browser blocks cross-origin fetch() calls even from page.evaluate() - - Results in "TypeError: Failed to fetch" - -2. **Authentication Complexity**: - - Direct HTTP (aiohttp) with cookies: 401/403 errors - - Playwright page.request.fetch(): 401 errors (separate context) - - Likely requires dynamic tokens beyond cookies - -## Recommendation: - -**Use `phase1_scraper.py` (DOM scraping) instead**. It works reliably with authenticated -sessions and extracts all Phase 1 fields without CORS limitations. - -## API Endpoints (discovered but inaccessible): -- Quote: /api/is.ResearchExperience/v1/quote -- Dividends: /api/is.ResearchExperience/v1/events/dividends -- Earnings: /api/is.ResearchExperience/v1/events/earnings -- Share Profile: /api/is.ResearchExperience/v1/shareprofile -""" - -from typing import Dict, Any, Optional, List -import logging -import uuid -import aiohttp -from playwright.async_api import Page - -from ...core import ( - QuoteData, EnhancedDividends, EarningsData, - CalculatedMetrics, EquityPhase1Data -) - -logger = logging.getLogger(__name__) - - -def _parse_float(value: Any) -> Optional[float]: - """Safely parse a value to float.""" - if value is None: - return None - try: - if isinstance(value, str): - # Remove % sign if present - value = value.replace('%', '').strip() - return float(value) - except (ValueError, TypeError): - return None - - -def _parse_market_cap(value: str) -> Optional[str]: - """Parse market cap string like '$3.03T' or '$462.11B'.""" - if not value: - return None - # Keep the formatted string as-is for readability - return value.strip() - - -def _parse_volume(value: Any) -> Optional[int]: - """Parse volume value.""" - if value is None: - return None - try: - return int(float(value)) - except (ValueError, TypeError): - return None - - -def parse_quote_api_response(data: Dict[str, Any]) -> QuoteData: - """Parse quote API response into QuoteData object. - - API Response Structure: - { - "reference": { - "symbol": "JNJ", - "companyName": "JOHNSON & JOHNSON", - "exchangeName": "NYSE" - }, - "quote": { - "lastPrice": 193.155, - "netChange": 1.275, - "netChangePercent": 0.6644778, - "postMarketChange": 0.0, - "postMarketPercentChange": 0.0, - "tradeTime": "2025-10-22T17:06:42.008Z" - }, - "regularQuote": { - "lastPrice": 193.155, - "lastSize": 100.0, - "netChange": 1.275, - "percentChange": 0.6644778, - ... - } - } - """ - quote = QuoteData() - - try: - reference = data.get('reference', {}) - quote_data = data.get('quote', {}) - regular_quote = data.get('regularQuote', {}) - - # Basic info - quote.exchange = reference.get('exchangeName') - - # Price data - quote.price = _parse_float(quote_data.get('lastPrice')) - quote.change = _parse_float(quote_data.get('netChange')) - quote.change_percent = _parse_float(quote_data.get('netChangePercent')) - - # After hours (post market) - quote.after_hours_change = _parse_float(quote_data.get('postMarketChange')) - quote.after_hours_change_percent = _parse_float(quote_data.get('postMarketPercentChange')) - - # Extended quote data - quote.previous_close = _parse_float(regular_quote.get('closePrice')) - quote.open = _parse_float(regular_quote.get('openPrice')) - quote.bid = _parse_float(regular_quote.get('bidPrice')) - quote.ask = _parse_float(regular_quote.get('askPrice')) - quote.volume = _parse_volume(regular_quote.get('totalVolume')) - quote.day_range_low = _parse_float(regular_quote.get('lowPrice')) - quote.day_range_high = _parse_float(regular_quote.get('highPrice')) - quote.week_52_low = _parse_float(regular_quote.get('priceLow52W')) - quote.week_52_high = _parse_float(regular_quote.get('priceHigh52W')) - - # Bid/Ask size - bid_size = regular_quote.get('bidSize', 0) - ask_size = regular_quote.get('askSize', 0) - if bid_size or ask_size: - quote.bid_ask_size = f"{bid_size}/{ask_size}" - - # Volume vs average - avg_volume_label = regular_quote.get('averageVolumeDaily') - if avg_volume_label: - quote.volume_vs_avg = avg_volume_label - - except Exception as e: - logger.debug(f"Error parsing quote API response: {e}") - - return quote - - -def parse_dividends_api_response(data: Dict[str, Any]) -> EnhancedDividends: - """Parse dividends API response into EnhancedDividends object. - - API Response Structure: - { - "symbol": "JNJ", - "currentAnnualDividendMethod": "IAD", - "status": "DIVIDENDS_PAID_CURRENTLY", - "dividends": [ - { - "dividendPayment": 1.3, - "dividendPayDate": "December 09, 2025", - "dividendExDate": "November 25, 2025", - "dividendFrequency": "Quarterly", - "annualDividendRate": 5.2, - "dividendYield": "2.71%" - }, - ... - ] - } - """ - dividends = EnhancedDividends() - - try: - dividend_list = data.get('dividends', []) - if not dividend_list: - return dividends - - # Most recent dividend is first - latest = dividend_list[0] - - # Next/upcoming dividend data - dividends.next_payment = _parse_float(latest.get('dividendPayment')) - dividends.next_pay_date = latest.get('dividendPayDate') - dividends.next_ex_date = latest.get('dividendExDate') - dividends.frequency = latest.get('dividendFrequency') - dividends.annual_rate = _parse_float(latest.get('annualDividendRate')) - dividends.annual_yield = _parse_float(latest.get('dividendYield')) - - # Previous dividend (if there's more than one in history) - if len(dividend_list) > 1: - previous = dividend_list[1] - dividends.previous_payment = _parse_float(previous.get('dividendPayment')) - dividends.previous_pay_date = previous.get('dividendPayDate') - dividends.previous_ex_date = previous.get('dividendExDate') - - except Exception as e: - logger.debug(f"Error parsing dividends API response: {e}") - - return dividends - - -def parse_earnings_api_response(data: Dict[str, Any]) -> EarningsData: - """Parse earnings API response into EarningsData object. - - API Response Structure: - { - "symbol": "GOOGL", - "fundamentals": {}, - "upcoming": { - "earningsDate": "10/29/2025", - "numberOfAnalysts": 43, - "epsNonGaapEstimate": 2.18 - }, - "historical": [ - { - "epsGaapActual": 2.31, - "epsNonGaapActual": 2.31, - "earningsDate": "07/23/2025", - "numberOfAnalysts": 43, - "epsNonGaapEstimate": 2.18, - "epsNonGaapEstimateHigh": 2.42, - "epsNonGaapEstimateLow": 2.0 - } - ] - } - """ - earnings = EarningsData() - - try: - upcoming = data.get('upcoming', {}) - historical = data.get('historical', []) - fundamentals = data.get('fundamentals', {}) - - # Upcoming earnings - if upcoming: - earnings.next_announcement_date = upcoming.get('earningsDate') - earnings.announcement_timing = upcoming.get('announcementTiming') - earnings.analysts_covering = upcoming.get('numberOfAnalysts') - earnings.consensus_estimate = _parse_float(upcoming.get('epsNonGaapEstimate')) - earnings.estimate_high = _parse_float(upcoming.get('epsNonGaapEstimateHigh')) - earnings.estimate_low = _parse_float(upcoming.get('epsNonGaapEstimateLow')) - - # Historical earnings (most recent) - if historical: - latest = historical[0] - earnings.eps_ttm = _parse_float(latest.get('epsNonGaapActual') or latest.get('epsGaapActual')) - - # If we don't have upcoming, use latest historical for analyst data - if not upcoming: - earnings.analysts_covering = latest.get('numberOfAnalysts') - earnings.consensus_estimate = _parse_float(latest.get('epsNonGaapEstimate')) - earnings.estimate_high = _parse_float(latest.get('epsNonGaapEstimateHigh')) - earnings.estimate_low = _parse_float(latest.get('epsNonGaapEstimateLow')) - - # Beat/miss information - beat_amount = latest.get('epsNonGaapBeat') - if beat_amount is not None: - earnings.recent_beats = [{ - 'beat_amount': _parse_float(beat_amount), - 'beat_percent': _parse_float(latest.get('epsNonGaapBeatPercent')), - 'date': latest.get('earningsDate') - }] - - # Fundamentals (PE ratios, revenue) - if fundamentals: - earnings.pe_ttm = _parse_float(fundamentals.get('peRatio')) - earnings.forward_pe = _parse_float(fundamentals.get('forwardPE')) - earnings.peg_ratio = _parse_float(fundamentals.get('pegRatio')) - earnings.revenue_ttm = _parse_float(fundamentals.get('revenue')) - - except Exception as e: - logger.debug(f"Error parsing earnings API response: {e}") - - return earnings - - -def parse_shareprofile_api_response(data: Dict[str, Any], quote: QuoteData) -> QuoteData: - """Parse share profile API response and enhance QuoteData with market cap, etc. - - API Response Structure: - { - "companySummary": { - "marketCapLabel": "Large Cap", - "marketCapValue": "$462.11B", - "companyEnterpriseValue": "$462.11B" - }, - "shareInfo": [{ - "sharesOutstanding": "2.41B", - "sharesHeld": "71.29%" - }] - } - """ - try: - company_summary = data.get('companySummary', {}) - - # Market cap - quote.market_cap = _parse_market_cap(company_summary.get('marketCapValue')) - - # Sector info might be in other fields - # Note: Sector information may not be in shareprofile API - # It might be in securityprofiles or other endpoints - - except Exception as e: - logger.debug(f"Error parsing share profile API response: {e}") - - return quote - - -def calculate_payout_ratio(annual_dividend: Optional[float], eps_ttm: Optional[float]) -> Optional[float]: - """Calculate dividend payout ratio. - - Formula: (Annual Dividend Rate / EPS TTM) × 100 - """ - if annual_dividend and eps_ttm and eps_ttm > 0: - ratio = (annual_dividend / eps_ttm) * 100 - return round(ratio, 2) - return None - - -async def call_schwab_api(page: Page, url: str, debug: bool = False) -> Optional[Dict[str, Any]]: - """Call a Schwab API endpoint from within the browser's JavaScript context. - - This uses page.evaluate() to run fetch() directly in the browser, which ensures - all cookies, authentication tokens, and session state are automatically included. - This is the most reliable way to call Schwab APIs. - - Args: - page: Playwright page with authenticated session - url: API endpoint URL - debug: Enable debug logging - - Returns: - Parsed JSON response or None on error - """ - try: - if debug: - logger.debug(f"Calling API: {url}") - - # Generate correlation IDs - correlator_id = str(uuid.uuid4()) - client_correlid = str(uuid.uuid4()) - - # Call API from within browser's JavaScript context using fetch() - # This automatically includes all cookies and session state - result = await page.evaluate(""" - async ({url, correlatorId, clientCorrelId}) => { - try { - const response = await fetch(url, { - method: 'GET', - credentials: 'include', // Include cookies - headers: { - 'accept': 'application/json', - 'accept-language': 'en-US,en;q=0.9', - 'cache-control': 'no-cache', - 'content-type': 'application/json', - 'correlatorid': correlatorId, - 'pragma': 'no-cache', - 'schwab-client-appid': 'AD00007800', - 'schwab-client-channel': 'IO', - 'schwab-client-correlid': clientCorrelId, - 'schwab-resource-version': '2', - } - }); - - if (!response.ok) { - const errorText = await response.text(); - return { - success: false, - status: response.status, - error: errorText - }; - } - - const data = await response.json(); - return { - success: true, - status: response.status, - data: data - }; - } catch (error) { - return { - success: false, - error: error.toString() - }; - } - } - """, {'url': url, 'correlatorId': correlator_id, 'clientCorrelId': client_correlid}) - - if not result.get('success'): - if debug: - status = result.get('status', 'unknown') - error = result.get('error', 'unknown error') - logger.debug(f"API returned status {status}: {str(error)[:200]}") - return None - - data = result.get('data') - - if debug and data: - logger.debug(f"API response keys: {list(data.keys()) if isinstance(data, dict) else 'list'}") - - return data - - except Exception as e: - if debug: - logger.debug(f"Error calling API {url}: {e}") - return None - - -async def extract_phase1_data_api(page: Page, ticker: str, debug: bool = False) -> EquityPhase1Data: - """Extract Phase 1 data using Schwab's REST APIs. - - This is the API-based replacement for the DOM scraping approach. - It calls Schwab's APIs directly using the authenticated session. - - Args: - page: Playwright page with authenticated session - ticker: Stock ticker symbol - debug: Enable debug logging - - Returns: - EquityPhase1Data with all extracted fields - """ - if debug: - logger.debug(f"Starting API-based Phase 1 extraction for {ticker}") - - base_url = "https://ausgateway.schwab.com/api/is.ResearchExperience/v1" - - # Build API URLs - quote_url = f"{base_url}/quote?symbols={ticker}&isComplex=true" - dividends_url = f"{base_url}/events/dividends?symbol={ticker}" - earnings_url = f"{base_url}/events/earnings?symbols={ticker}" - profile_url = f"{base_url}/shareprofile?symbols={ticker}&includeSubsidiaries=true" - - # Make API calls using Playwright's request context (includes cookies automatically) - quote_data = await call_schwab_api(page, quote_url, debug) - dividends_data = await call_schwab_api(page, dividends_url, debug) - earnings_data = await call_schwab_api(page, earnings_url, debug) - profile_data = await call_schwab_api(page, profile_url, debug) - - # Parse responses - # Quote API returns a list, get first item - if quote_data and isinstance(quote_data, list) and len(quote_data) > 0: - quote = parse_quote_api_response(quote_data[0]) - elif quote_data and isinstance(quote_data, dict): - quote = parse_quote_api_response(quote_data) - else: - quote = QuoteData() - - # Enhance quote with share profile data - if profile_data: - quote = parse_shareprofile_api_response(profile_data, quote) - - # Parse dividends - dividends = parse_dividends_api_response(dividends_data) if dividends_data else EnhancedDividends() - - # Parse earnings - earnings = parse_earnings_api_response(earnings_data) if earnings_data else EarningsData() - - # Calculate derived metrics - calculated = CalculatedMetrics() - if dividends.annual_rate and earnings.eps_ttm: - calculated.payout_ratio = calculate_payout_ratio( - dividends.annual_rate, - earnings.eps_ttm - ) - - # Create Phase 1 data object - phase1_data = EquityPhase1Data( - ticker=ticker, - quote=quote, - dividends=dividends, - earnings=earnings, - calculated_metrics=calculated - ) - - if debug: - logger.debug(f"API-based Phase 1 extraction complete for {ticker}") - # Count populated fields (dataclasses with slots don't have __dict__) - from dataclasses import fields as dataclass_fields - quote_count = sum(1 for f in dataclass_fields(quote) if getattr(quote, f.name) is not None) - div_count = sum(1 for f in dataclass_fields(dividends) if getattr(dividends, f.name) is not None) - earn_count = sum(1 for f in dataclass_fields(earnings) if getattr(earnings, f.name) not in (None, [])) - logger.debug(f" Quote fields populated: {quote_count}/21") - logger.debug(f" Dividend fields populated: {div_count}/9") - logger.debug(f" Earnings fields populated: {earn_count}/13") - - return phase1_data - diff --git a/schwab_scraper/features/equity/phase1_scraper.py b/schwab_scraper/features/equity/phase1_scraper.py deleted file mode 100644 index fee041d..0000000 --- a/schwab_scraper/features/equity/phase1_scraper.py +++ /dev/null @@ -1,786 +0,0 @@ -"""Phase 1: Essential Dividend Metrics Implementation (DEPRECATED) - -⚠️ DEPRECATED: This DOM-scraping based approach has been replaced by phase1_api_scraper.py -which uses Schwab's REST APIs directly. The API approach is more reliable, complete, -and maintainable than DOM scraping. - -This module is kept for reference only. New code should use phase1_api_scraper.py. - -Old approach extracts from DOM: -- Quote/Price Data (symbol bar) -- Enhanced Dividend Information (forward-looking dates) -- Core Earnings Metrics (EPS, forecasts) -- Basic Valuation Ratios (P/E, Forward P/E, PEG) -- Calculated Metrics (payout ratio) -""" - -from typing import Dict, Any, Optional -import re -import logging - -from ...core import QuoteData, EnhancedDividends, EarningsData, CalculatedMetrics, EquityPhase1Data - - -logger = logging.getLogger(__name__) - - -def _parse_float(value: Any) -> Optional[float]: - """Safely parse a value to float, handling $ and % symbols.""" - if value is None: - return None - try: - # Remove common formatting characters - clean = str(value).strip().replace('$', '').replace(',', '').replace('%', '') - if clean and clean != '--' and clean.lower() != 'n/a': - return float(clean) - except (ValueError, AttributeError): - pass - return None - - -def _parse_int(value: Any) -> Optional[int]: - """Safely parse a value to int.""" - if value is None: - return None - try: - clean = str(value).strip().replace(',', '') - if clean and clean != '--' and clean.lower() != 'n/a': - return int(float(clean)) - except (ValueError, AttributeError): - pass - return None - - -def _parse_volume(volume_str: str) -> Optional[int]: - """Parse volume string like '8M', '22.4M', '1.2B' to integer.""" - if not volume_str: - return None - - try: - volume_str = volume_str.strip().upper() - multiplier = 1 - - if volume_str.endswith('K'): - multiplier = 1_000 - volume_str = volume_str[:-1] - elif volume_str.endswith('M'): - multiplier = 1_000_000 - volume_str = volume_str[:-1] - elif volume_str.endswith('B'): - multiplier = 1_000_000_000 - volume_str = volume_str[:-1] - - value = float(volume_str) - return int(value * multiplier) - except (ValueError, AttributeError): - return None - - -def _parse_revenue(revenue_str: str) -> Optional[float]: - """Parse revenue string like '$92.15B', '$1.5M' to dollar value.""" - if not revenue_str: - return None - - try: - revenue_str = revenue_str.strip().upper().replace('$', '').replace(',', '') - multiplier = 1 - - if revenue_str.endswith('K'): - multiplier = 1_000 - revenue_str = revenue_str[:-1] - elif revenue_str.endswith('M'): - multiplier = 1_000_000 - revenue_str = revenue_str[:-1] - elif revenue_str.endswith('B'): - multiplier = 1_000_000_000 - revenue_str = revenue_str[:-1] - elif revenue_str.endswith('T'): - multiplier = 1_000_000_000_000 - revenue_str = revenue_str[:-1] - - value = float(revenue_str) - return value * multiplier - except (ValueError, AttributeError): - return None - - -async def extract_quote_data(page, ticker: str = "", debug: bool = False) -> QuoteData: - """Extract quote/price data from symbol bar. - - Args: - page: Playwright page object - ticker: Stock ticker symbol (for pattern matching) - debug: Enable debug logging - - Returns: - QuoteData object with extracted fields - """ - quote = QuoteData() - - try: - if debug: - logger.debug("Starting quote data extraction...") - - # Wait for symbol bar content (look for key labels) - try: - await page.wait_for_selector('#app-symbol-bar-component, text=Previous close', state='attached', timeout=15000) - except Exception: - if debug: - logger.debug("Timeout waiting for symbol bar selector, attempting to parse whatever is there") - - # Extract symbol bar text content (fallback to body if specific component not found) - symbol_bar_text = await page.evaluate(''' - () => { - const symbolBar = document.querySelector('#app-symbol-bar-component'); - if (symbolBar && symbolBar.textContent && symbolBar.textContent.includes('Previous close')) return symbolBar.textContent; - - // If specific component not found, try to find the container with market data - // Look for container with "Previous close" - const labels = Array.from(document.querySelectorAll('span, div, p')); - const prevCloseLabel = labels.find(el => el.textContent && el.textContent.includes('Previous close')); - if (prevCloseLabel) { - // Return the parent's text content (go up a few levels to capture all data) - let parent = prevCloseLabel.parentElement; - let count = 0; - while (parent && count < 8) { - if (parent.textContent.length > 300) return parent.textContent; - parent = parent.parentElement; - count++; - } - } - - return document.body.textContent || ''; - } - ''') - - if debug: - logger.debug(f"Symbol bar text (first 500 chars): {symbol_bar_text[:500]}") - - # Extract structured data - quote_data = await page.evaluate(r''' - (ticker) => { - const data = {}; - - // Helper to get text content from page - const getText = () => { - const symbolBar = document.querySelector('#app-symbol-bar-component'); - // Verify it looks like the right component by checking for "Previous close" - if (symbolBar && symbolBar.textContent && symbolBar.textContent.includes('Previous close')) { - return symbolBar.textContent; - } - - // Fallback logic - const labels = Array.from(document.querySelectorAll('span, div, p')); - const prevCloseLabel = labels.find(el => el.textContent && el.textContent.includes('Previous close')); - if (prevCloseLabel) { - let parent = prevCloseLabel.parentElement; - let count = 0; - while (parent && count < 8) { - if (parent.textContent.length > 300) return parent.textContent; - parent = parent.parentElement; - count++; - } - } - - // Last resort: body text - return document.body.textContent || ''; - }; - - const fullText = getText(); - - // Try to find price in quote container first for accuracy - const priceElement = document.querySelector('.symbol-quote-container, [data-testid="quote-price"]'); - if (priceElement) { - const priceText = priceElement.textContent || ''; - const priceMatch = priceText.match(/\$([0-9,]+\.[0-9]+)/); - if (priceMatch) data.price = priceMatch[1].replace(',', ''); - } else { - // Fallback regex for price if element not found - // Look for price near top or just regex - const priceMatch = fullText.match(/\$([0-9,]+\.[0-9]{2})(\s|[+-]|$)/); - if (priceMatch) data.price = priceMatch[1].replace(',', ''); - } - - // After hours (using \s* for robustness) - const afterHoursMatch = fullText.match(/After hours:?\s*\$([0-9,.]+)/i); - if (afterHoursMatch) data.after_hours_price = afterHoursMatch[1].replace(',', ''); - - const afterHoursChangeMatch = fullText.match(/After hours:.*?([+-]\$[0-9,.]+)\s*\(([+-][0-9.]+)%\)/i); - if (afterHoursChangeMatch) { - data.after_hours_change = afterHoursChangeMatch[1].replace('$', '').replace(',', ''); - data.after_hours_change_percent = afterHoursChangeMatch[2]; - } - - // Bid/Ask (using \s* for robustness) - const bidMatch = fullText.match(/Bid\s*\$([0-9,.]+)/i); - if (bidMatch) data.bid = bidMatch[1].replace(',', ''); - - const askMatch = fullText.match(/Ask\s*\$([0-9,.]+)/i); - if (askMatch) data.ask = askMatch[1].replace(',', ''); - - const bidAskSizeMatch = fullText.match(/Bid\/Ask Size\s*([0-9]+\/[0-9]+)/i); - if (bidAskSizeMatch) data.bid_ask_size = bidAskSizeMatch[1]; - - // Previous close and open (using \s* instead of \s+) - const prevCloseMatch = fullText.match(/Previous close\s*\$([0-9,.]+)/i); - if (prevCloseMatch) data.previous_close = prevCloseMatch[1].replace(',', ''); - - const openMatch = fullText.match(/Today's open\s*\$([0-9,.]+)/i); - if (openMatch) data.open = openMatch[1].replace(',', ''); - - // Volume (using \s*) - const volumeMatch = fullText.match(/Today's volume\s*([0-9.]+[KMB]?)/i); - if (volumeMatch) data.volume = volumeMatch[1]; - - const volumeVsAvgMatch = fullText.match(/Today's volume\s*[0-9.]+[KMB]?\s*(Above Avg\.|Below Avg\.|Average)/i); - if (volumeVsAvgMatch) data.volume_vs_avg = volumeVsAvgMatch[1]; - - // Day range - // Pattern: "Today's range low $200.81 Today's range high $203.45" or similar - // We'll look for "low $X" and "high $Y" appearing after "Today's range" - const dayRangeMatch = fullText.match(/Today's range.*?low\s*\$([0-9,.]+).*?high\s*\$([0-9,.]+)/i); - if (dayRangeMatch) { - data.day_range_low = dayRangeMatch[1].replace(',', ''); - data.day_range_high = dayRangeMatch[2].replace(',', ''); - } - - // 52-week range - const weekRangeMatch = fullText.match(/52-week range.*?low\s*\$([0-9,.]+).*?high\s*\$([0-9,.]+)/i); - if (weekRangeMatch) { - data.week_52_low = weekRangeMatch[1].replace(',', ''); - data.week_52_high = weekRangeMatch[2].replace(',', ''); - } - - // Market cap (may be in Share Profile section) - const marketCapMatch = fullText.match(/Market Cap\s*\$([0-9.]+[KMBT])/i); - if (marketCapMatch) data.market_cap = marketCapMatch[1]; - - // Change and change percent - - // Try specific formatted pattern first: TICKER $PRICE CHANGE CHANGE% - // e.g. "JNJ $201.95 -1.03 -0.51%" - const standardPattern = fullText.match(/\$([0-9,.]+)\s*([+-]?[0-9,.]+)\s*([+-]?[0-9.]+)%/); - if (standardPattern) { - if (!data.price) data.price = standardPattern[1].replace(',', ''); - data.change = standardPattern[2]; - data.change_percent = standardPattern[3]; - } - - let percentMatch = null; - if (ticker && !data.change_percent) { - // Match: TICKER$digits.digits{2}percent% - const tickerPattern = new RegExp(ticker + '\\\\.?[\\s]*\\$([0-9,]+\\\\.[0-9]{2})[\\s]*([0-9.]+)%', 'i'); - percentMatch = fullText.match(tickerPattern); - if (percentMatch) { - data.change_percent = percentMatch[2]; - } - } - - if (!data.change_percent) { - // Fallback: match any price+percent pattern with space - const fallbackMatch = fullText.match(/\$[0-9,.]+\s*([+-]?[0-9.]+)%/); - if (fallbackMatch) { - data.change_percent = fallbackMatch[1]; - } - } - - // Pattern 2: "+$1.23 (+0.45%)" or "-$1.23 (-0.45%)" - let changeMatch = fullText.match(/([+-]\$[0-9,.]+)\s*\(([+-][0-9.]+)%\)/); - // Pattern 3: "$193.08 +1.23 +0.64%" (price followed by change) - if (!changeMatch) { - changeMatch = fullText.match(/\$[0-9,.]+\s*([+-][0-9,.]+)\s*([+-][0-9.]+)%/); - } - // Pattern 4: "Change: +1.23 (+0.64%)" - if (!changeMatch) { - changeMatch = fullText.match(/Change:?\s*([+-][0-9,.]+)\s*\(([+-][0-9.]+)%\)/i); - } - if (changeMatch) { - data.change = changeMatch[1].replace('$', '').replace(',', ''); - if (!data.change_percent) { - data.change_percent = changeMatch[2].replace(/[+]/g, ''); - } - } - - // Exchange - look for NYSE, NASDAQ, etc. - const exchangeMatch = fullText.match(/\b(NYSE|NASDAQ|AMEX|OTC|BATS)\b/i); - if (exchangeMatch) data.exchange = exchangeMatch[1].toUpperCase(); - - return data; - } - ''', ticker) - - # Parse and assign values - quote.price = _parse_float(quote_data.get('price')) - quote.change = _parse_float(quote_data.get('change')) - quote.change_percent = _parse_float(quote_data.get('change_percent')) - quote.after_hours_price = _parse_float(quote_data.get('after_hours_price')) - quote.after_hours_change = _parse_float(quote_data.get('after_hours_change')) - quote.after_hours_change_percent = _parse_float(quote_data.get('after_hours_change_percent')) - quote.bid = _parse_float(quote_data.get('bid')) - quote.ask = _parse_float(quote_data.get('ask')) - quote.bid_ask_size = quote_data.get('bid_ask_size') - quote.previous_close = _parse_float(quote_data.get('previous_close')) - quote.open = _parse_float(quote_data.get('open')) - quote.volume = _parse_volume(quote_data.get('volume', '')) - quote.volume_vs_avg = quote_data.get('volume_vs_avg') - quote.day_range_low = _parse_float(quote_data.get('day_range_low')) - quote.day_range_high = _parse_float(quote_data.get('day_range_high')) - quote.week_52_low = _parse_float(quote_data.get('week_52_low')) - quote.week_52_high = _parse_float(quote_data.get('week_52_high')) - quote.market_cap = quote_data.get('market_cap') - - # Try to extract sector and exchange from page header - header_data = await page.evaluate(r''' - () => { - const data = {}; - - // Look for sector near company name - const sectorElement = document.querySelector('[data-testid="sector"], .sector'); - if (sectorElement) { - data.sector = sectorElement.textContent.replace('Sector', '').trim(); - } else { - // Manual search for text containing "Sector" - const spans = Array.from(document.querySelectorAll('span')); - const sectorSpan = spans.find(el => el.textContent && el.textContent.includes('Sector')); - if (sectorSpan) { - data.sector = sectorSpan.textContent.replace('Sector', '').replace(':', '').trim(); - } - } - - // Look for exchange near ticker - const exchangeElement = document.querySelector('[data-testid="exchange"], .exchange'); - if (exchangeElement) { - data.exchange = exchangeElement.textContent.trim(); - } - - // Fallback: parse from page text - const pageText = document.body.textContent || ''; - if (!data.sector) { - const sectorMatch = pageText.match(/Sector[:\s]+([A-Za-z\s&]+)/); - if (sectorMatch) data.sector = sectorMatch[1].trim(); - } - if (!data.exchange) { - const exchangeMatch = pageText.match(/(NYSE|NASDAQ|AMEX|OTC)/i); - if (exchangeMatch) data.exchange = exchangeMatch[1].toUpperCase(); - } - - return data; - } - ''') - - quote.sector = header_data.get('sector') - quote.exchange = header_data.get('exchange') - - if debug: - logger.debug(f"Extracted quote data: price={quote.price}, volume={quote.volume}, " - f"52w_range={quote.week_52_low}-{quote.week_52_high}") - - except Exception as e: - if debug: - logger.debug(f"Error extracting quote data: {e}") - - return quote - - -async def extract_enhanced_dividends(page, debug: bool = False) -> EnhancedDividends: - """Extract enhanced dividend data including next payment dates. - - Args: - page: Playwright page object - debug: Enable debug logging - - Returns: - EnhancedDividends object with extracted fields - """ - dividends = EnhancedDividends() - - try: - if debug: - logger.debug("Starting enhanced dividend extraction...") - - # Wait for dividends panel to load - await page.wait_for_selector('#dividends', timeout=15000) - - # Scroll to dividends panel - await page.evaluate(''' - () => { - const dividendsPanel = document.querySelector('#dividends'); - if (dividendsPanel) { - dividendsPanel.scrollIntoView({ behavior: 'smooth', block: 'center' }); - } - } - ''') - await page.wait_for_timeout(1000) - - # CRITICAL: Click on the panel header to trigger content loading - # Schwab's panels don't auto-load - they need to be clicked - if debug: - logger.debug("Clicking dividends panel header to trigger content load...") - try: - dividends_header = await page.query_selector('#dividends h2, #dividends .sdps-panel__title, #dividends-togglechevron-button') - if dividends_header: - await dividends_header.click() - await page.wait_for_timeout(2000) - if debug: - logger.debug("Clicked dividends panel header successfully") - except Exception as e: - if debug: - logger.debug(f"Could not click dividends header: {e}") - - # Wait for content to load after click - await page.wait_for_timeout(1000) - - # Extract dividend data - dividend_data = await page.evaluate(''' - () => { - const data = {}; - const dividendsPanel = document.querySelector('#dividends'); - if (!dividendsPanel) return data; - - const fullText = dividendsPanel.textContent || ''; - - // DEBUG: Return sample of text for debugging - data._debug_text_sample = fullText.substring(0, 800); - - // Next dividend payment - const nextPaymentMatch = fullText.match(/Next Dividend Payment\\s*\\$([0-9.]+)/i); - if (nextPaymentMatch) data.next_payment = nextPaymentMatch[1]; - - // Next pay date - const nextPayDateMatch = fullText.match(/Next Pay Date\\s*([A-Za-z]+ [0-9]{1,2}, [0-9]{4})/i); - if (nextPayDateMatch) data.next_pay_date = nextPayDateMatch[1]; - - // Next ex-date - const nextExDateMatch = fullText.match(/Next Ex-Date\\s*([A-Za-z]+ [0-9]{1,2}, [0-9]{4})/i); - if (nextExDateMatch) data.next_ex_date = nextExDateMatch[1]; - - // Previous dividend payment - const prevPaymentMatch = fullText.match(/Previous Dividend Payment\\s*\\$([0-9.]+)/i); - if (prevPaymentMatch) data.previous_payment = prevPaymentMatch[1]; - - // Previous pay date - const prevPayDateMatch = fullText.match(/Previous Pay Date\\s*([A-Za-z]+ [0-9]{1,2}, [0-9]{4})/i); - if (prevPayDateMatch) data.previous_pay_date = prevPayDateMatch[1]; - - // Previous ex-date - const prevExDateMatch = fullText.match(/Previous Ex-Date\\s*([A-Za-z]+ [0-9]{1,2}, [0-9]{4})/i); - if (prevExDateMatch) data.previous_ex_date = prevExDateMatch[1]; - - // Frequency - const frequencyMatch = fullText.match(/Frequency\\s*(Quarterly|Monthly|Annual|Semi-Annual)/i); - if (frequencyMatch) data.frequency = frequencyMatch[1]; - - // Annual Dividend Rate (IAD) - const annualRateMatch = fullText.match(/Annual Dividend Rate.*?\\$([0-9.]+)/i); - if (annualRateMatch) data.annual_rate = annualRateMatch[1]; - - // Annual Dividend Yield - appears after "Annual Dividend Yield" text - // Text pattern: "Annual Dividend Yield...2.71%" - const yieldMatch = fullText.match(/Annual Dividend Yield[\\s\\S]{0,300}?([0-9]+\\.[0-9]+)%/i); - if (yieldMatch) data.annual_yield = yieldMatch[1]; - - return data; - } - ''') - - if debug and dividend_data.get('_debug_text_sample'): - logger.debug(f"Dividend panel text sample: {dividend_data['_debug_text_sample']}") - - # Parse and assign values - dividends.next_payment = _parse_float(dividend_data.get('next_payment')) - dividends.next_pay_date = dividend_data.get('next_pay_date') - dividends.next_ex_date = dividend_data.get('next_ex_date') - dividends.previous_payment = _parse_float(dividend_data.get('previous_payment')) - dividends.previous_pay_date = dividend_data.get('previous_pay_date') - dividends.previous_ex_date = dividend_data.get('previous_ex_date') - dividends.frequency = dividend_data.get('frequency') - dividends.annual_rate = _parse_float(dividend_data.get('annual_rate')) - dividends.annual_yield = _parse_float(dividend_data.get('annual_yield')) - - if debug: - logger.debug(f"Extracted dividend data: next_payment={dividends.next_payment}, " - f"next_pay_date={dividends.next_pay_date}, annual_rate={dividends.annual_rate}") - - except Exception as e: - if debug: - logger.debug(f"Error extracting dividend data: {e}") - - return dividends - - -async def extract_earnings_data(page, debug: bool = False) -> EarningsData: - """Extract earnings metrics and forecasts. - - Args: - page: Playwright page object - debug: Enable debug logging - - Returns: - EarningsData object with extracted fields - """ - earnings = EarningsData() - - try: - if debug: - logger.debug("Starting earnings data extraction...") - - # Wait for earnings panel to load - await page.wait_for_selector('#expected-earnings', timeout=15000) - - # Scroll to earnings panel - await page.evaluate(''' - () => { - const earningsPanel = document.querySelector('#expected-earnings'); - if (earningsPanel) { - earningsPanel.scrollIntoView({ behavior: 'smooth', block: 'center' }); - } - } - ''') - await page.wait_for_timeout(1000) - - # CRITICAL: Click on the panel header to trigger content loading - # Schwab's panels don't auto-load - they need to be clicked - if debug: - logger.debug("Clicking earnings panel header to trigger content load...") - try: - earnings_header = await page.query_selector('#expected-earnings h2, #expected-earnings .sdps-panel__title, #expected-earnings-heading, #expected-earnings-togglechevron-button') - if earnings_header: - await earnings_header.click() - await page.wait_for_timeout(2000) - if debug: - logger.debug("Clicked earnings panel header successfully") - except Exception as e: - if debug: - logger.debug(f"Could not click earnings header: {e}") - - # Wait for content to load after click - await page.wait_for_timeout(1000) - - # Check for and click "Show More" if present - try: - # Use JS to find and click - most robust way - clicked = await page.evaluate(''' - () => { - const panel = document.querySelector('#expected-earnings'); - if (!panel) return false; - - // Find any element with "Show More" text - const elements = Array.from(panel.querySelectorAll('a, button, span, div')); - const showMore = elements.find(el => el.textContent.trim().toLowerCase() === "show more"); - - if (showMore) { - showMore.click(); - return true; - } - return false; - } - ''') - - if clicked: - if debug: - logger.debug("found and clicked 'Show More' via JS") - await page.wait_for_timeout(2000) - elif debug: - logger.debug("'Show More' not found or not clickable") - - except Exception as e: - if debug: - logger.debug(f"Error checking for Show More: {e}") - - # Extract earnings data - earnings_data = await page.evaluate(r''' - (debug) => { - const data = {}; - // Helper to get text content including Shadow DOMs - const getDeepText = (root) => { - if (!root) return ''; - if (root.nodeType === Node.TEXT_NODE) return root.textContent; - if (root.nodeType === Node.ELEMENT_NODE && root.shadowRoot) { - return getDeepText(root.shadowRoot); - } - - let text = ''; - const children = root.childNodes; - for (let i = 0; i < children.length; i++) { - text += getDeepText(children[i]); - } - return text; - }; - - const earningsPanel = document.querySelector('#expected-earnings'); - let fullText = ''; - - if (earningsPanel) { - fullText = getDeepText(earningsPanel); - } - - // Fallback to body deep text if panel seems empty - if (fullText.length < 500 || !fullText.includes("Announcement")) { - fullText = getDeepText(document.body); - } - - // Next earnings announcement - robust regex checking for various patterns - let nextAnnouncementMatch = fullText.match(/Next Earnings Announcement.*?([0-9]{2}\/[0-9]{2}\/[0-9]{4})/i); - if (!nextAnnouncementMatch) { - // Try alternate pattern: Announcement: 12/12/2025 - nextAnnouncementMatch = fullText.match(/Announcement:?\s*([0-9]{2}\/[0-9]{2}\/[0-9]{4})/i); - } - if (nextAnnouncementMatch) data.next_announcement_date = nextAnnouncementMatch[1]; - - // Announcement timing - const timingMatch = fullText.match(/(Before Market Open|After Market Close)/i); - if (timingMatch) data.announcement_timing = timingMatch[1]; - - // Number of analysts - const analystsMatch = fullText.match(/With ([0-9]+) analysts covering/i); - if (analystsMatch) data.analysts_covering = analystsMatch[1]; - - // Consensus estimate - const consensusMatch = fullText.match(/consensus.*?estimate is \\$([0-9.]+)/i); - if (consensusMatch) data.consensus_estimate = consensusMatch[1]; - - // High/Low estimates - const highLowMatch = fullText.match(/high and low estimates are \\$([0-9.]+) and \\$([0-9.]+)/i); - if (highLowMatch) { - data.estimate_high = highLowMatch[1]; - data.estimate_low = highLowMatch[2]; - } - - // EPS TTM (multiple patterns) - let epsMatch = fullText.match(/EPS\s*\(TTM\)\s*(?:Value)?\s*\$?([0-9.-]+)/i); - if (!epsMatch) epsMatch = fullText.match(/Earnings per Share\s*\(?TTM\)?\s*(?:Value)?\s*\$?([0-9.-]+)/i); - if (!epsMatch) epsMatch = fullText.match(/EPS\s+(?:Value)?\s*([0-9.-]+)/i); - if (epsMatch) data.eps_ttm = epsMatch[1]; - - // Revenue TTM - let revenueMatch = fullText.match(/Revenue\s*\(TTM\)\s*(?:Value)?\s*\$([0-9.]+[KMBT]?)/i); - if (!revenueMatch) revenueMatch = fullText.match(/Revenue\s+(?:Value)?\s*\$([0-9.]+[KMBT])/i); - if (revenueMatch) data.revenue_ttm = revenueMatch[1]; - - // P/E TTM (multiple patterns) - let peMatch = fullText.match(/Price[\/\s]*Earnings\s*\(TTM\)\s*(?:Value)?\s*([0-9.]+)/i); - if (!peMatch) peMatch = fullText.match(/P[\/\s]*E\s*\(?TTM\)?\s*(?:Value)?\s*([0-9.]+)/i); - if (!peMatch) peMatch = fullText.match(/PE Ratio\s*\(TTM\)\s*(?:Value)?\s*([0-9.]+)/i); - if (peMatch) data.pe_ttm = peMatch[1]; - - // Forward P/E - let forwardPeMatch = fullText.match(/Forward\s+P[\/\s]*E\s*(?:Value)?\s*([0-9.]+)/i); - if (!forwardPeMatch) forwardPeMatch = fullText.match(/P[\/\s]*E\s*\(Forward\)\s*(?:Value)?\s*([0-9.]+)/i); - if (forwardPeMatch) data.forward_pe = forwardPeMatch[1]; - - // PEG Ratio - let pegMatch = fullText.match(/Price\s+to\s+Earnings[\/\s]*Growth\s*\(PEG\)\s*(?:Value)?\s*([0-9.]+)/i); - if (!pegMatch) pegMatch = fullText.match(/PEG\s*Ratio?\s*(?:Value)?\s*([0-9.]+)/i); - if (pegMatch) data.peg_ratio = pegMatch[1]; - - // Recent beats/misses (simplified - just extract beat amounts) - const beatMatches = fullText.matchAll(/Beat.*?\$([0-9.]+)/gi); - data.recent_beats = []; - for (const match of beatMatches) { - data.recent_beats.push(match[1]); - } - - return data; - } - ''', debug) - - # Parse and assign values - earnings.next_announcement_date = earnings_data.get('next_announcement_date') - earnings.announcement_timing = earnings_data.get('announcement_timing') - earnings.analysts_covering = _parse_int(earnings_data.get('analysts_covering')) - earnings.consensus_estimate = _parse_float(earnings_data.get('consensus_estimate')) - earnings.estimate_high = _parse_float(earnings_data.get('estimate_high')) - earnings.estimate_low = _parse_float(earnings_data.get('estimate_low')) - earnings.eps_ttm = _parse_float(earnings_data.get('eps_ttm')) - earnings.revenue_ttm = _parse_revenue(earnings_data.get('revenue_ttm', '')) - earnings.pe_ttm = _parse_float(earnings_data.get('pe_ttm')) - earnings.forward_pe = _parse_float(earnings_data.get('forward_pe')) - earnings.peg_ratio = _parse_float(earnings_data.get('peg_ratio')) - - # Store recent beats as list of dicts - if earnings_data.get('recent_beats'): - earnings.recent_beats = [ - {'beat_amount': _parse_float(beat)} - for beat in earnings_data.get('recent_beats', []) - ] - - if debug: - logger.debug(f"Extracted earnings data: eps_ttm={earnings.eps_ttm}, " - f"pe_ttm={earnings.pe_ttm}, forward_pe={earnings.forward_pe}") - - except Exception as e: - if debug: - logger.debug(f"Error extracting earnings data: {e}") - - return earnings - - -def calculate_payout_ratio(annual_dividend: Optional[float], eps_ttm: Optional[float]) -> Optional[float]: - """Calculate dividend payout ratio. - - Formula: (Annual Dividend Rate / EPS TTM) × 100 - - Args: - annual_dividend: Annual dividend rate per share - eps_ttm: Earnings per share (trailing twelve months) - - Returns: - Payout ratio as percentage, or None if cannot calculate - """ - if annual_dividend and eps_ttm and eps_ttm > 0: - ratio = (annual_dividend / eps_ttm) * 100 - return round(ratio, 2) - return None - - -async def extract_phase1_data(page, debug: bool = False) -> EquityPhase1Data: - """Extract all Phase 1 data points. - - Args: - page: Playwright page object - debug: Enable debug output - - Returns: - EquityPhase1Data object with all extracted data - """ - if debug: - logger.debug("Starting Phase 1 data extraction...") - - # Wait for page to stabilize - await page.wait_for_timeout(3000) - - # Extract ticker from page URL - ticker = await page.evaluate(''' - () => { - const url = window.location.href; - const match = url.match(/stocks\\/([A-Z]+)/i); - return match ? match[1].toUpperCase() : ''; - } - ''') - - # Extract each section - quote = await extract_quote_data(page, ticker=ticker, debug=debug) - dividends = await extract_enhanced_dividends(page, debug=debug) - earnings = await extract_earnings_data(page, debug=debug) - - # Calculate derived metrics - calculated = CalculatedMetrics() - if dividends.annual_rate and earnings.eps_ttm: - calculated.payout_ratio = calculate_payout_ratio( - dividends.annual_rate, - earnings.eps_ttm - ) - - # Create Phase 1 data object - phase1_data = EquityPhase1Data( - ticker=ticker, - quote=quote, - dividends=dividends, - earnings=earnings, - calculated_metrics=calculated - ) - - if debug: - logger.debug(f"Phase 1 extraction complete for {ticker}") - - return phase1_data diff --git a/schwab_scraper/features/equity/scraper.py b/schwab_scraper/features/equity/scraper.py deleted file mode 100644 index d5afcb8..0000000 --- a/schwab_scraper/features/equity/scraper.py +++ /dev/null @@ -1,977 +0,0 @@ -from typing import Dict, Any, Optional -from ...utils.logging import save_debug_artifact - - -def should_replace_dividend_value(existing_value: Optional[str], new_value: Optional[str]) -> bool: - """ - Decide whether to replace an existing dividend field value with a new one. - - Rules: - - Never replace with empty/None values - - Replace if there is no existing value - - Replace if the existing value is "Show More" or contains "Show More" - - Otherwise, keep the existing (good) data - """ - if not new_value or not str(new_value).strip(): - return False - if not existing_value: - return True - existing_text = str(existing_value) - if existing_text == 'Show More' or 'Show More' in existing_text: - return True - return False - - -async def extract_dividend_data(page, debug: bool = False) -> Dict[str, Any]: - """ - Extract dividend information from Schwab stock page. - Returns dictionary with dividend data fields. - """ - dividend_data: Dict[str, Any] = {} - - try: - if debug: - print("DEBUG: Starting dividend data extraction...") - # Take initial screenshot to see page state - png = await page.screenshot(full_page=True) - path = save_debug_artifact("debug_dividend_start.png", png) - print(f"DEBUG: Initial screenshot saved as {path}") - - # Wait for the dividends section to load dynamically - if debug: - print("DEBUG: Waiting for dividends section to load...") - - try: - # First wait for the dividends panel to appear - await page.wait_for_selector('#dividends', timeout=15000) - if debug: - print("DEBUG: #dividends panel found") - - # Wait for dividend content to load dynamically - dividend_loaded = False - max_attempts = 5 # Reduced from 10 for faster tests - attempt = 0 - - while not dividend_loaded and attempt < max_attempts: - attempt += 1 - if debug: - print(f"DEBUG: Attempt {attempt}/{max_attempts} - Waiting for dynamic dividend content...") - - # Check if the dividends section has been populated with actual content - dividend_status = await page.evaluate(''' - () => { - const result = { loaded: false, debug: {} }; - - // Look for the dividends panel content that should be populated - const dividendsPanel = document.querySelector('#dividends'); - if (dividendsPanel) { - const panelBody = dividendsPanel.querySelector('.sdps-panel__body'); - if (panelBody) { - const textContent = panelBody.textContent || ''; - result.debug.panelBodyLength = textContent.length; - result.debug.panelBodySample = textContent.substring(0, 200); - - // Check if the panel has been populated with actual dividend text - // (not just empty comments) - const hasRealContent = textContent.length > 50 && ( - textContent.includes('Previous Dividend') || - textContent.includes('Pay Date') || - textContent.includes('Ex-Date') || - textContent.includes('Frequency') || - textContent.includes('Annual Dividend') || - textContent.includes('$') || - textContent.includes('%') - ); - - if (hasRealContent) { - result.loaded = true; - return result; - } - } - } - - // Alternative: check for stock-dividends component - const stockDividends = document.querySelector('stock-dividends'); - if (stockDividends) { - const text = stockDividends.textContent || ''; - result.debug.stockDividendsLength = text.length; - result.debug.stockDividendsSample = text.substring(0, 100); - - if (text.length > 20 && text.includes('$')) { - result.loaded = true; - return result; - } - } - - // Alternative: check for any elements with dividend-related content - const allElements = document.querySelectorAll('#dividends *'); - result.debug.totalElements = allElements.length; - - for (let elem of allElements) { - const text = elem.textContent || ''; - if (text.includes('Previous Dividend Payment') || - (text.includes('$') && text.includes('.'))) { - result.loaded = true; - result.debug.foundInElement = elem.tagName + '.' + elem.className; - return result; - } - } - - return result; - } - ''') - - if debug: - print(f"DEBUG: Dividend status: {dividend_status}") - - dividend_loaded = dividend_status.get('loaded', False) - - if dividend_loaded: - if debug: - print("DEBUG: Dynamic dividend content loaded!") - png = await page.screenshot(full_page=True) - path = save_debug_artifact("debug_dividend_content_loaded.png", png) - print(f"DEBUG: Screenshot after content loaded: {path}") - break - - # Wait between attempts to allow for async loading - await page.wait_for_timeout(1000) # Reduced from 2000ms for faster tests - - if not dividend_loaded: - if debug: - print("DEBUG: Basic dividend content did not auto-load - this suggests the page is not behaving as expected") - print("DEBUG: Expected behavior: Basic dividend info should be visible without clicking 'Show More'") - - # Try to force a page refresh or trigger loading - print("DEBUG: Attempting to trigger dividend content loading...") - try: - # Try scrolling to the dividend section to trigger lazy loading - await page.evaluate(''' - () => { - const dividendsPanel = document.querySelector('#dividends'); - if (dividendsPanel) { - dividendsPanel.scrollIntoView({ behavior: 'smooth', block: 'center' }); - } - } - ''') - await page.wait_for_timeout(3000) - - # Try clicking on the dividends panel header to ensure it's active - try: - dividends_header = await page.query_selector('#dividends h2, #dividends .sdps-panel__title') - if dividends_header: - await dividends_header.click() - await page.wait_for_timeout(2000) - print("DEBUG: Clicked on dividends panel header") - except: - pass - - # Check one more time if content loaded - final_status = await page.evaluate(''' - () => { - const dividendsPanel = document.querySelector('#dividends'); - if (dividendsPanel) { - const panelBody = dividendsPanel.querySelector('.sdps-panel__body'); - if (panelBody) { - const textContent = panelBody.textContent || ''; - return { - length: textContent.length, - sample: textContent.substring(0, 500), - hasBasicData: textContent.includes('$') && ( - textContent.includes('Previous') || - textContent.includes('Pay Date') || - textContent.includes('Ex-Date') - ) - }; - } - } - return { length: 0, sample: '', hasBasicData: false }; - } - ''') - - if debug: - print(f"DEBUG: Final dividend panel status: {final_status}") - - if final_status.get('hasBasicData'): - print("DEBUG: Basic dividend data now detected after manual triggering!") - dividend_loaded = True - - # Extract the data immediately while it's loaded - immediate_extraction = await page.evaluate(r''' - () => { - const results = {}; - const dividendsPanel = document.querySelector('#dividends'); - - if (dividendsPanel) { - const panelBody = dividendsPanel.querySelector('.sdps-panel__body'); - if (panelBody) { - const fullText = panelBody.textContent || ''; - - // Extract data using pattern matching from the full text - const patterns = { - 'Previous Dividend Payment': /Previous Dividend Payment\s*\$([0-9]+\.[0-9]+)/, - 'Previous Pay Date': /Previous Pay Date\s*([A-Za-z]+ [0-9]{1,2}, [0-9]{4})/, - 'Previous Ex-Date': /Previous Ex-Date\s*([A-Za-z]+ [0-9]{1,2}, [0-9]{4})/, - 'Frequency': /Frequency\s*([A-Za-z]+)/, - 'Annual Dividend Rate': /(?:Annual Dividend Rate|IAD).*?\$([0-9]+\.[0-9]+)/, - 'Annual Dividend Yield': /([0-9]+\.[0-9]+%)(?=\s|Annual|$)/ - }; - - for (const [field, pattern] of Object.entries(patterns)) { - const match = fullText.match(pattern); - if (match) { - if (field === 'Previous Dividend Payment' || field === 'Annual Dividend Rate') { - results[field] = '$' + match[1]; - } else { - results[field] = match[1]; - } - } - } - } - } - - return results; - } - ''') - - if debug: - print(f"DEBUG: Immediate extraction results: {immediate_extraction}") - - if immediate_extraction: - dividend_data.update(immediate_extraction) - # Clean up the Frequency field if it has extra text - if 'Frequency' in dividend_data and 'Quarterly' in dividend_data['Frequency']: - dividend_data['Frequency'] = 'Quarterly' - - except Exception as e: - if debug: - print(f"DEBUG: Error during manual triggering: {e}") - - png = await page.screenshot(full_page=True) - path = save_debug_artifact("debug_dividend_timeout.png", png) - print(f"DEBUG: Screenshot after timeout: {path}") - - except Exception as e: - if debug: - print(f"DEBUG: Error waiting for dividend content: {e}") - - # Check for dividend grid directly without clicking - if debug: - print("DEBUG: Checking for #dividend-grid...") - - dividend_grid_found = False - try: - await page.wait_for_selector('#dividend-grid', timeout=10000) - dividend_grid_found = True - if debug: - print("DEBUG: #dividend-grid found!") - png = await page.screenshot(full_page=True) - path = save_debug_artifact("debug_dividend_grid_found.png", png) - print(f"DEBUG: Screenshot with dividend grid: {path}") - except: - if debug: - print("DEBUG: #dividend-grid not found initially") - png = await page.screenshot(full_page=True) - path = save_debug_artifact("debug_dividend_no_grid.png", png) - print(f"DEBUG: Screenshot without grid: {path}") - - # Try to scroll to the dividend section to ensure it's in view - if debug: - print("DEBUG: Scrolling to stock-dividends component...") - - try: - await page.evaluate(''' - () => { - const stockDividends = document.querySelector('stock-dividends'); - if (stockDividends) { - stockDividends.scrollIntoView({ behavior: 'smooth', block: 'center' }); - } - } - ''') - await page.wait_for_timeout(3000) - - if debug: - png = await page.screenshot(full_page=True) - path = save_debug_artifact("debug_dividend_after_scroll.png", png) - print(f"DEBUG: Screenshot after scroll: {path}") - - # Check again for dividend grid after scrolling - try: - await page.wait_for_selector('#dividend-grid', timeout=5000) - dividend_grid_found = True - if debug: - print("DEBUG: #dividend-grid found after scroll!") - png = await page.screenshot(full_page=True) - path = save_debug_artifact("debug_dividend_grid_after_scroll.png", png) - print(f"DEBUG: Screenshot with grid after scroll: {path}") - except: - if debug: - print("DEBUG: #dividend-grid still not found after scroll") - - except Exception as e: - if debug: - print(f"DEBUG: Error during scroll attempt: {e}") - - # Common dividend section selectors used by financial websites - dividend_selectors = [ - '#dividend-grid', # Primary target based on user feedback - 'stock-dividends', # Secondary target - the web component - '#dividend-section', - '#dividends-section', - '.dividend-summary', - '.dividends-summary', - 'div[data-testid*="dividend"]', - 'div[aria-label*="dividend"]', - '[class*="dividend"]', - 'section:has-text("Dividend")', - 'div:has-text("Previous Dividend Payment")' - ] - - # Try to find dividend section - dividend_section = None - for selector in dividend_selectors: - try: - if await page.is_visible(selector): - dividend_section = selector - if debug: - print(f"DEBUG: Found dividend section with selector: {selector}") - break - except: - continue - - if not dividend_section: - if debug: - print("DEBUG: No dividend section found, trying broader search...") - - # In debug mode, capture the page content to help identify selectors - page_content = await page.content() - path_html = save_debug_artifact("debug_dividend_page.html", page_content) - print(f"DEBUG: Page HTML saved to {path_html} for analysis") - - # Also save a screenshot to see the visual layout - png = await page.screenshot(full_page=True) - path_png = save_debug_artifact("debug_dividend_page.png", png) - print(f"DEBUG: Page screenshot saved to {path_png}") - - # Fallback: look for dividend-related text anywhere on page - dividend_text_exists = await page.evaluate(''' - () => { - const text = document.body.innerText.toLowerCase(); - return text.includes('dividend') || text.includes('ex-date') || text.includes('pay date') || text.includes('previous dividend') || text.includes('iad'); - } - ''') - - if debug: - print(f"DEBUG: Dividend-related text found on page: {dividend_text_exists}") - - # Try scrolling down to reveal more content - await page.evaluate('window.scrollTo(0, document.body.scrollHeight)') - await page.wait_for_timeout(2000) - - # Extract all text content that might contain dividend info - dividend_related_text = await page.evaluate(''' - () => { - const text = document.body.innerText; - const lines = text.split('\n'); - const dividendLines = lines.filter(line => { - const lower = line.toLowerCase(); - return lower.includes('dividend') || lower.includes('ex-date') || - lower.includes('pay date') || lower.includes('previous') || - lower.includes('iad') || lower.includes('frequency') || - lower.includes('quarterly') || lower.includes('$0.26') || - lower.includes('0.4865%') || lower.includes('$1.04') || - lower.includes('annual dividend') || lower.includes('yield'); - }); - return dividendLines; - } - ''') - print(f"DEBUG: Found dividend-related text lines: {dividend_related_text}") - - # Try a more comprehensive search for dividend data - all_dividend_info = await page.evaluate(''' - () => { - // Look for elements containing common dividend field names - const fieldNames = [ - 'Previous Dividend Payment', 'Next Dividend Payment', - 'Previous Pay Date', 'Next Pay Date', - 'Previous Ex-Date', 'Next Ex-Date', 'Ex-Date', - 'Frequency', 'Annual Dividend Rate', 'IAD', - 'Annual Dividend Yield', 'Dividend Yield' - ]; - - const results = {}; - - fieldNames.forEach(fieldName => { - // Search for elements containing this field name - const elements = Array.from(document.querySelectorAll('*')).filter(el => - el.textContent && el.textContent.includes(fieldName) && - el.children.length === 0 // Text nodes only - ); - - elements.forEach(el => { - // Look for value in nearby elements - const parent = el.parentElement; - if (parent) { - const siblings = Array.from(parent.children); - const currentIndex = siblings.indexOf(el); - - // Check next siblings for values - for (let i = currentIndex + 1; i < siblings.length; i++) { - const sibling = siblings[i]; - const text = sibling.textContent.trim(); - if (text && text !== fieldName && text.length > 0 && text.length < 50) { - results[fieldName] = text; - break; - } - } - - // Check same element for values after the field name - const fullText = el.textContent; - const fieldIndex = fullText.indexOf(fieldName); - if (fieldIndex >= 0) { - const afterField = fullText.substring(fieldIndex + fieldName.length).trim(); - if (afterField && afterField.length > 0 && afterField.length < 50) { - results[fieldName] = afterField; - } - } - } - }); - }); - - return results; - } - ''') - print(f"DEBUG: Comprehensive dividend search results: {all_dividend_info}") - - # If we found data in the comprehensive search, use it only if we don't already have good data - if all_dividend_info: - for field, value in all_dividend_info.items(): - if value and value.strip(): - existing_value = dividend_data.get(field, '') - if should_replace_dividend_value(existing_value, value): - dividend_data[field] = value.strip() - if debug: - print(f"DEBUG: Added dividend field from comprehensive search: {field} = {value}") - elif debug: - print(f"DEBUG: Keeping existing good data for {field}: {existing_value} (ignoring comprehensive search value: {value})") - - if not dividend_text_exists: - if debug: - print("DEBUG: No dividend-related content found on page") - return dividend_data - - # Use body as fallback section for broad search - dividend_section = 'body' - if debug: - print("DEBUG: Using body as dividend section for broad search") - - # If we found the dividend grid, use specific selectors based on user feedback - if dividend_section == '#dividend-grid': - if debug: - print("DEBUG: Using specific dividend grid selectors...") - - try: - # First check if dividend grid is actually present and populated - grid_status = await page.evaluate(''' - () => { - const dividendGrid = document.querySelector('#dividend-grid'); - if (!dividendGrid) return { found: false, message: 'No #dividend-grid element found' }; - - const textContent = dividendGrid.textContent || ''; - const hasContent = textContent.trim().length > 50; - const childCount = dividendGrid.children.length; - - return { - found: true, - hasContent, - textLength: textContent.length, - childCount, - preview: textContent.substring(0, 200), - message: `Grid found with ${childCount} children, ${textContent.length} chars` - }; - } - ''') - - if debug: - print(f"DEBUG: Dividend grid status: {grid_status}") - - # Extract dividend data using improved selectors - specific_dividend_data = await page.evaluate(r''' - () => { - const results = {}; - - // Check if dividend grid exists and has content - const dividendGrid = document.querySelector('#dividend-grid'); - if (dividendGrid) { - const allGridText = dividendGrid.textContent || ''; - const lines = allGridText.split('\n').map(line => line.trim()).filter(line => line.length > 0); - - // Try structured approach first - look for rows/cells - const dividendRows = dividendGrid.querySelectorAll('div[class*="row"], tr, .dividend-row, div:has(div)'); - dividendRows.forEach((row, rowIndex) => { - const rowText = row.textContent || ''; - - // Look for dividend payment info - if (rowText.includes('Dividend Payment') || (rowText.includes('Previous') && rowText.includes('$'))) { - const amountMatch = rowText.match(/\$[0-9]+\.[0-9]+/); - if (amountMatch && !results['Previous Dividend Payment']) { - results['Previous Dividend Payment'] = amountMatch[0]; - } - - // Look for dates in the same row - const dateMatches = rowText.match(/([A-Za-z]+ [0-9]{1,2}, [0-9]{4})/g); - if (dateMatches) { - if (dateMatches.length >= 1 && !results['Previous Pay Date']) results['Previous Pay Date'] = dateMatches[0]; - if (dateMatches.length >= 2 && !results['Previous Ex-Date']) results['Previous Ex-Date'] = dateMatches[1]; - } - } - }); - - // Fallback: Parse all lines systematically - for (let i = 0; i < lines.length; i++) { - const line = lines[i]; - const nextLine = i + 1 < lines.length ? lines[i + 1] : ''; - - // Match dividend payment - if ((line.includes('Previous Dividend Payment') || line.includes('Dividend Payment')) && !results['Previous Dividend Payment']) { - const amountPattern = /\$[0-9]+\.[0-9]+/; - let amount = line.match(amountPattern) || nextLine.match(amountPattern); - if (amount) results['Previous Dividend Payment'] = amount[0]; - } - - // Match pay date - if (line.includes('Pay Date') && !results['Previous Pay Date']) { - const datePattern = /[A-Za-z]{3,9} [0-9]{1,2}, [0-9]{4}/; - let date = line.match(datePattern) || nextLine.match(datePattern); - if (date) results['Previous Pay Date'] = date[0]; - } - - // Match ex-date - if (line.includes('Ex-Date') && !results['Previous Ex-Date']) { - const datePattern = /[A-Za-z]{3,9} [0-9]{1,2}, [0-9]{4}/; - let date = line.match(datePattern) || nextLine.match(datePattern); - if (date) results['Previous Ex-Date'] = date[0]; - } - - // Match frequency - if (line.includes('Frequency') && !results['Frequency']) { - const freqLine = line + ' ' + nextLine; - if (freqLine.toLowerCase().includes('quarterly')) results['Frequency'] = 'Quarterly'; - else if (freqLine.toLowerCase().includes('monthly')) results['Frequency'] = 'Monthly'; - else if (freqLine.toLowerCase().includes('annual')) results['Frequency'] = 'Annual'; - else if (freqLine.toLowerCase().includes('semi')) results['Frequency'] = 'Semi-Annual'; - } - - // Match annual dividend rate - if ((line.includes('Annual Dividend Rate') || line.includes('IAD')) && !results['Annual Dividend Rate']) { - const amountPattern = /\$[0-9]+\.[0-9]+/; - let amount = line.match(amountPattern) || nextLine.match(amountPattern); - if (amount) results['Annual Dividend Rate'] = amount[0]; - } - - // Match annual dividend yield - if (line.includes('Annual Dividend Yield') && !results['Annual Dividend Yield']) { - const percentPattern = /[0-9]+\.[0-9]+%/; - let percent = line.match(percentPattern) || nextLine.match(percentPattern); - if (percent) results['Annual Dividend Yield'] = percent[0]; - } - } - } - - return results; - } - ''') - - if debug: - print(f"DEBUG: Specific dividend grid extraction results: {specific_dividend_data}") - - # Add the extracted data to dividend_data only if we don't already have good data - if specific_dividend_data: - for field, value in specific_dividend_data.items(): - existing_value = dividend_data.get(field, '') - if should_replace_dividend_value(existing_value, value): - dividend_data[field] = value - if debug: - print(f"DEBUG: Updated {field} from specific extraction: {value}") - elif debug: - print(f"DEBUG: Keeping existing good data for {field}: {existing_value} (ignoring specific extraction value: {value})") - - except Exception as e: - if debug: - print(f"DEBUG: Error in specific dividend grid extraction: {e}") - - # Extract dividend data using the correct structure from gemini analysis - if debug: - print("DEBUG: Extracting dividend data from dividend-grid structure...") - - # First try to extract data from the dynamically loaded dividend content - try: - dividend_dynamic_data = await page.evaluate(r''' - () => { - const results = {}; - - // Strategy 1: Look for any dividend grid structure that was loaded - const dividendGrid = document.querySelector('#dividend-grid'); - if (dividendGrid) { - const rows = dividendGrid.querySelectorAll('div.sdps-row, .row'); - - for (let row of rows) { - const cells = row.querySelectorAll('div[class*="col-"]'); - if (cells.length >= 2) { - const label = cells[0].textContent.trim(); - const value = cells[1].textContent.trim(); - - // Map the labels to our expected field names - if (label.includes('Previous Dividend Payment') || label.includes('Dividend Payment')) { - results['Previous Dividend Payment'] = value; - } else if (label.includes('Previous Pay Date') || label.includes('Pay Date')) { - results['Previous Pay Date'] = value; - } else if (label.includes('Previous Ex-Date') || label.includes('Ex-Date')) { - results['Previous Ex-Date'] = value; - } else if (label.includes('Frequency')) { - results['Frequency'] = value; - } else if (label.includes('Annual Dividend Rate') || label.includes('IAD')) { - results['Annual Dividend Rate'] = value; - } else if (label.includes('Annual Dividend Yield')) { - results['Annual Dividend Yield'] = value; - } - } - } - - if (Object.keys(results).length > 0) { - return results; - } - } - - // Strategy 2: Look for stock-dividends component content - const stockDividends = document.querySelector('stock-dividends'); - if (stockDividends) { - const allText = stockDividends.textContent || ''; - const lines = allText.split('\n').map(line => line.trim()).filter(line => line); - - for (let i = 0; i < lines.length; i++) { - const line = lines[i]; - const nextLine = i + 1 < lines.length ? lines[i + 1] : ''; - - if (line.includes('Previous Dividend Payment') || line.includes('Dividend Payment')) { - const amountMatch = (line + ' ' + nextLine).match(/\$[0-9]+\.[0-9]+/); - if (amountMatch) results['Previous Dividend Payment'] = amountMatch[0]; - } else if (line.includes('Pay Date')) { - const dateMatch = (line + ' ' + nextLine).match(/[A-Za-z]+ [0-9]{1,2}, [0-9]{4}/); - if (dateMatch) results['Previous Pay Date'] = dateMatch[0]; - } else if (line.includes('Ex-Date')) { - const dateMatch = (line + ' ' + nextLine).match(/[A-Za-z]+ [0-9]{1,2}, [0-9]{4}/); - if (dateMatch) results['Previous Ex-Date'] = dateMatch[0]; - } else if (line.includes('Frequency')) { - if (line.toLowerCase().includes('quarterly') || nextLine.toLowerCase().includes('quarterly')) { - results['Frequency'] = 'Quarterly'; - } else if (line.toLowerCase().includes('monthly') || nextLine.toLowerCase().includes('monthly')) { - results['Frequency'] = 'Monthly'; - } else if (line.toLowerCase().includes('annual') || nextLine.toLowerCase().includes('annual')) { - results['Frequency'] = 'Annual'; - } - } else if (line.includes('Annual Dividend Rate') || line.includes('IAD')) { - const amountMatch = (line + ' ' + nextLine).match(/\$[0-9]+\.[0-9]+/); - if (amountMatch) results['Annual Dividend Rate'] = amountMatch[0]; - } else if (line.includes('Annual Dividend Yield')) { - const percentMatch = (line + ' ' + nextLine).match(/[0-9]+\.[0-9]+%/); - if (percentMatch) results['Annual Dividend Yield'] = percentMatch[0]; - } - } - - if (Object.keys(results).length > 0) { - return results; - } - } - - // Strategy 3: Look within entire dividends panel for any structured content - const dividendsPanel = document.querySelector('#dividends'); - if (dividendsPanel) { - const allElements = dividendsPanel.querySelectorAll('*'); - - for (let elem of allElements) { - const text = elem.textContent || ''; - - // Look for dollar amounts near dividend-related text - if (text.includes('Previous Dividend Payment') || text.includes('Dividend Payment')) { - const parent = elem.parentElement; - if (parent) { - const siblings = Array.from(parent.children); - const currentIndex = siblings.indexOf(elem); - - // Check next siblings for values - for (let j = currentIndex + 1; j < siblings.length; j++) { - const sibling = siblings[j]; - const siblingText = sibling.textContent.trim(); - const amountMatch = siblingText.match(/\$[0-9]+\.[0-9]+/); - if (amountMatch) { - results['Previous Dividend Payment'] = amountMatch[0]; - break; - } - } - } - } - - // Similar logic for other fields... - // (truncated for brevity but would include Pay Date, Ex-Date, etc.) - } - } - - return results; - } - ''') - - if debug: - print(f"DEBUG: Dynamic dividend extraction results: {dividend_dynamic_data}") - - if dividend_dynamic_data: - for field, value in dividend_dynamic_data.items(): - existing_value = dividend_data.get(field, '') - if should_replace_dividend_value(existing_value, value): - dividend_data[field] = value - if debug: - print(f"DEBUG: Updated {field} from dynamic extraction: {value}") - elif debug: - print(f"DEBUG: Keeping existing good data for {field}: {existing_value} (ignoring dynamic extraction value: {value})") - - except Exception as e: - if debug: - print(f"DEBUG: Error in dynamic dividend extraction: {e}") - - # Define dividend fields and their possible selectors as fallback - dividend_fields = { - 'Previous Dividend Payment': [ - '#dividend-grid div:has-text("Previous Dividend Payment") ~ div', - '#dividend-grid div:has-text("Dividend Payment") ~ div', - '#dividends span:has-text("Previous Dividend Payment") + span', - '#dividends div:has-text("Previous Dividend Payment") + div', - '#dividends *:has-text("Previous Dividend Payment") ~ *', - 'stock-dividends span:has-text("Previous Dividend Payment") + span', - 'stock-dividends div:has-text("Previous Dividend Payment") + div', - 'span:has-text("Previous Dividend Payment") + span', - 'div:has-text("Previous Dividend Payment") + div', - '*:has-text("Previous Dividend Payment") ~ *', - 'span:has-text("Next Dividend Payment") + span', - 'div:has-text("Next Dividend Payment") + div', - '*:has-text("Next Dividend Payment") ~ *', - '[data-field="dividend-payment"]', - '.dividend-payment' - ], - 'Previous Pay Date': [ - '#dividend-grid div:has-text("Previous Pay Date") ~ div', - '#dividend-grid div:has-text("Pay Date") ~ div', - '#dividends span:has-text("Previous Pay Date") + span', - '#dividends div:has-text("Previous Pay Date") + div', - '#dividends *:has-text("Previous Pay Date") ~ *', - 'stock-dividends span:has-text("Previous Pay Date") + span', - 'stock-dividends div:has-text("Previous Pay Date") + div', - 'span:has-text("Previous Pay Date") + span', - 'div:has-text("Previous Pay Date") + div', - '*:has-text("Previous Pay Date") ~ *', - 'span:has-text("Next Pay Date") + span', - 'div:has-text("Next Pay Date") + div', - '*:has-text("Next Pay Date") ~ *', - '*:has-text("Pay Date") ~ *', - '[data-field="pay-date"]', - '.pay-date' - ], - 'Previous Ex-Date': [ - '#dividend-grid div:has-text("Previous Ex-Date") ~ div', - '#dividend-grid div:has-text("Ex-Date") ~ div', - '#dividends span:has-text("Previous Ex-Date") + span', - '#dividends div:has-text("Previous Ex-Date") + div', - '#dividends *:has-text("Previous Ex-Date") ~ *', - 'stock-dividends span:has-text("Previous Ex-Date") + span', - 'stock-dividends div:has-text("Previous Ex-Date") + div', - 'span:has-text("Previous Ex-Date") + span', - 'div:has-text("Previous Ex-Date") + div', - '*:has-text("Previous Ex-Date") ~ *', - 'span:has-text("Next Ex-Date") + span', - 'div:has-text("Next Ex-Date") + div', - '*:has-text("Next Ex-Date") ~ *', - '*:has-text("Ex-Date") ~ *', - '[data-field="ex-date"]', - '.ex-date' - ], - 'Frequency': [ - '#dividend-grid div:has-text("Frequency") ~ div', - '#dividends span:has-text("Frequency") + span', - '#dividends div:has-text("Frequency") + div', - '#dividends *:has-text("Frequency") ~ *', - 'stock-dividends span:has-text("Frequency") + span', - 'stock-dividends div:has-text("Frequency") + div', - 'span:has-text("Frequency") + span', - 'div:has-text("Frequency") + div', - '*:has-text("Frequency") ~ *', - '[data-field="frequency"]', - '.dividend-frequency', - '.frequency' - ], - 'Annual Dividend Rate': [ - '#dividend-grid div:has-text("Annual Dividend Rate") ~ div', - '#dividend-grid div:has-text("IAD") ~ div', - '#dividends span:has-text("Annual Dividend Rate") + span', - '#dividends div:has-text("Annual Dividend Rate") + div', - '#dividends *:has-text("Annual Dividend Rate") ~ *', - '#dividends span:has-text("IAD") + span', - '#dividends *:has-text("IAD") ~ *', - 'stock-dividends span:has-text("Annual Dividend Rate") + span', - 'stock-dividends div:has-text("Annual Dividend Rate") + div', - 'stock-dividends span:has-text("IAD") + span', - 'span:has-text("Annual Dividend Rate") + span', - 'div:has-text("Annual Dividend Rate") + div', - '*:has-text("Annual Dividend Rate") ~ *', - 'span:has-text("IAD") + span', - '*:has-text("IAD") ~ *', - '[data-field="annual-rate"]', - '.annual-dividend-rate' - ], - 'Annual Dividend Yield': [ - '#dividend-grid div:has-text("Annual Dividend Yield") ~ div', - '#dividends span:has-text("Annual Dividend Yield") + span', - '#dividends div:has-text("Annual Dividend Yield") + div', - '#dividends *:has-text("Annual Dividend Yield") ~ *', - 'stock-dividends span:has-text("Annual Dividend Yield") + span', - 'stock-dividends div:has-text("Annual Dividend Yield") + div', - 'span:has-text("Annual Dividend Yield") + span', - 'div:has-text("Annual Dividend Yield") + div', - '*:has-text("Annual Dividend Yield") ~ *', - '[data-field="dividend-yield"]', - '.dividend-yield' - ] - } - - # Extract each dividend field using multiple selector strategies - for field_name, selectors in dividend_fields.items(): - field_found = False - - # Try each selector for this field - for selector in selectors: - if field_found: - break - - try: - # Scope search within dividend section if found, otherwise search whole page - full_selector = f'{dividend_section} {selector}' if dividend_section != 'body' else selector - - if await page.is_visible(full_selector, timeout=1000): - value = await page.inner_text(full_selector) - clean_value = value.strip() - - if clean_value and clean_value != field_name: # Ensure we got actual value, not the label - existing_value = dividend_data.get(field_name, '') - if should_replace_dividend_value(existing_value, clean_value): - dividend_data[field_name] = clean_value - field_found = True - if debug: - print(f"DEBUG: Found {field_name}: {clean_value} (selector: {full_selector})") - elif debug: - print(f"DEBUG: Keeping existing good data for {field_name}: {existing_value} (ignoring selector-based value: {clean_value})") - break - except: - continue - - # If standard selectors failed, try JavaScript-based text search as fallback - if not field_found: - try: - # Try multiple variations of the field name - search_terms = [field_name] - if "Previous" in field_name: - search_terms.append(field_name.replace("Previous", "Next")) - if "Annual Dividend Rate" in field_name: - search_terms.append("IAD") - if "Annual Dividend Yield" in field_name: - search_terms.append("Dividend Yield") - - for search_term in search_terms: - if field_found: - break - - value = await page.evaluate(rf''' - () => {{ - const searchText = "{search_term}"; - - // First check within the dividends section specifically - const dividendsPanel = document.querySelector('#dividends'); - const stockDividends = document.querySelector('stock-dividends'); - const searchContainers = [dividendsPanel, stockDividends, document]; - - for (let container of searchContainers) {{ - if (!container) continue; - - const elements = Array.from(container.querySelectorAll('*')); - - for (let elem of elements) {{ - if (elem.textContent && elem.textContent.includes(searchText)) {{ - // Look for next sibling or nearby element with value - let candidate = elem.nextElementSibling; - if (candidate && candidate.textContent && - !candidate.textContent.includes(searchText) && - candidate.textContent.trim().length > 0) {{ - return candidate.textContent.trim(); - }} - - // Try parent's next sibling - candidate = elem.parentElement?.nextElementSibling; - if (candidate && candidate.textContent && - !candidate.textContent.includes(searchText) && - candidate.textContent.trim().length > 0) {{ - return candidate.textContent.trim(); - }} - - // Try looking in the same element's parent for nearby text - const parent = elem.parentElement; - if (parent) {{ - const parentText = parent.textContent; - const lines = parentText.split('\n'); - for (let i = 0; i < lines.length; i++) {{ - if (lines[i].includes(searchText) && i + 1 < lines.length) {{ - const nextLine = lines[i + 1].trim(); - if (nextLine && !nextLine.includes(searchText)) {{ - return nextLine; - }} - }} - }} - }} - }} - }} - - // If found in this container, stop searching - if (container !== document) {{ - break; - }} - }} - return null; - }} - ''') - - if value and value.strip(): - existing_value = dividend_data.get(field_name, '') - if should_replace_dividend_value(existing_value, value): - dividend_data[field_name] = value.strip() - field_found = True - if debug: - print(f"DEBUG: Found {field_name} via JS search with term '{search_term}': {value}") - elif debug: - print(f"DEBUG: Keeping existing good data for {field_name}: {existing_value} (ignoring JS search value: {value})") - break - - except Exception as e: - if debug: - print(f"DEBUG: Could not find {field_name}: {e}") - continue - - if debug: - print(f"DEBUG: Extracted dividend data: {dividend_data}") - - return dividend_data - - except Exception as e: - if debug: - print(f"DEBUG: Error extracting dividend data: {e}") - return dividend_data - - -async def extract(page, debug: bool = False) -> Dict[str, Any]: - """Compatibility wrapper to call `extract_dividend_data`""" - return await extract_dividend_data(page, debug=debug) diff --git a/schwab_scraper/features/equity/service.py b/schwab_scraper/features/equity/service.py deleted file mode 100644 index 537d804..0000000 --- a/schwab_scraper/features/equity/service.py +++ /dev/null @@ -1,452 +0,0 @@ -import time -from typing import Any, Dict, Optional -import logging -from ...core.config import load_config, get_playwright_url -from ...browser.auth import ensure_cookies -from ...browser.client import connect, new_context, new_page -from ...browser.navigation import goto_with_auth_check -from ...core import Envelope, ErrorType, MorningstarData, EquityPhase1Data, fail, ok -from .morningstar import find_report, download_report_as_bytes -from ...storage.cache import ensure_cache_dir, cache_filename, read_cached_pdf, write_cached_pdf -from .parser import parse as parse_pdf -from .scraper import extract_dividend_data -from .phase1_scraper import extract_phase1_data # DOM scraping - the working approach -import re - -def extract_company_name_from_title(page_title: str, ticker: str): - if not page_title: - return None - try: - title = ( - page_title.replace(" | Charles Schwab", "") - .replace(" - Charles Schwab", "") - .replace("Stock Quote & Summary", "") - .replace("Stock Research", "") - .replace("Research", "") - .replace("- Research", "") - ) - pattern = rf"^(.+?)\s*\({re.escape(ticker.upper())}\)" - match = re.match(pattern, title, re.IGNORECASE) - if match: - company_name = match.group(1).strip() - company_name = company_name.replace(" -", "").strip() - if len(company_name) > 1 and not company_name.isdigit(): - return company_name - for separator in [" |", " -"]: - if separator in title: - potential_name = title.split(separator)[0].strip() - if potential_name.upper() != ticker.upper() and len(potential_name) > 1: - return potential_name - return None - except Exception: - return None - - -async def get_equity_phase1_data(ticker: str, debug: bool = False) -> Envelope[EquityPhase1Data]: - """Get Phase 1 enhanced equity data for a ticker. - - Extracts: - - Quote/Price Data (symbol bar) - - Enhanced Dividend Information (forward-looking dates) - - Core Earnings Metrics (EPS, forecasts) - - Basic Valuation Ratios (P/E, Forward P/E, PEG) - - Calculated Metrics (payout ratio) - - Args: - ticker: Stock ticker symbol - debug: Enable debug logging - - Returns: - Envelope containing EquityPhase1Data or error - """ - ticker = ticker.upper() - logger = logging.getLogger(__name__) - if debug: - logger.setLevel(logging.DEBUG) - logger.debug(f"Starting get_equity_phase1_data for {ticker}") - - # Session management - cookies = await ensure_cookies() - if not cookies: - return fail( - "Unable to establish a session. Provide credentials in config.json or a valid cookies.json.", - ErrorType.AUTHENTICATION, - retryable=False, - ) - - config = load_config() - playwright_url = get_playwright_url(config) - - # Browser orchestration - context = None - page = None - p, browser = await connect(playwright_url) - try: - context = await new_context(browser, cookies=cookies) - page = await new_page(context) - - # Navigate to stock research page - timeout = 30000 if debug else 45000 - success = await goto_with_auth_check( - page, - context, - f"https://client.schwab.com/app/research/#/stocks/{ticker}", - debug=debug, - timeout=timeout, - ) - if not success: - return fail( - "Authentication failed while navigating to research page", - ErrorType.AUTHENTICATION, - retryable=True, - ) - - # Validate ticker by checking for stock page content - if debug: - logger.debug(f"Current page URL: {page.url}") - - try: - # Wait for stock-specific content to appear - await page.wait_for_selector( - 'span.sdps-title-3.sc-sdps-solo-layout:not(.sdps-font-bold), #morningstar-section', - timeout=10000, - state='visible' - ) - except Exception as wait_err: - if debug: - logger.debug(f"Timeout waiting for stock content: {wait_err}") - return fail( - f"Invalid ticker: {ticker}. This appears not to be a valid stock ticker.", - ErrorType.VALIDATION, - retryable=False, - ) - - # Validate content - try: - has_valid_content = await page.evaluate(''' - () => { - const nameSpan = document.querySelector('span.sdps-title-3.sc-sdps-solo-layout:not(.sdps-font-bold)'); - if (nameSpan && nameSpan.textContent && nameSpan.textContent.trim().length > 2) { - return true; - } - const morningstarSection = document.querySelector('#morningstar-section'); - if (morningstarSection) { - return true; - } - return false; - } - ''') - - if not has_valid_content: - return fail( - f"Invalid ticker: {ticker}. This appears not to be a valid stock ticker.", - ErrorType.VALIDATION, - retryable=False, - ) - except Exception as e: - logger.debug(f"Error checking for valid content: {e}") - return fail( - f"Invalid ticker: {ticker}. Unable to validate ticker.", - ErrorType.VALIDATION, - retryable=False, - ) - - # Extract Phase 1 data using improved DOM scraping - # Note: API approach failed due to CORS restrictions - phase1_data = await extract_phase1_data(page, debug=debug) - - return ok(phase1_data) - - finally: - try: - if page is not None: - await page.close() - except Exception: - pass - try: - if context is not None: - await context.close() - except Exception: - pass - for handle in (browser,): - try: - if handle is not None: - await handle.close() - except Exception: - pass - try: - if p is not None: - await p.stop() - except Exception: - pass - - -async def get_morningstar_data(ticker: str, debug: bool = False) -> Envelope[MorningstarData]: - ticker = ticker.upper() - ensure_cache_dir() - logger = logging.getLogger(__name__) - if debug: - logger.setLevel(logging.DEBUG) - logger.debug(f"Starting get_morningstar_data for {ticker}") - - # Session management - cookies = await ensure_cookies() - if not cookies: - return fail( - "Unable to establish a session. Provide credentials in config.json or a valid cookies.json.", - ErrorType.AUTHENTICATION, - retryable=False, - ) - - config = load_config() - playwright_url = get_playwright_url(config) - - # Browser orchestration - context = None - page = None - p, browser = await connect(playwright_url) - try: - context = await new_context(browser, cookies=cookies) - page = await new_page(context) - - # Use shared auth-aware navigation helper for consistency - # Use shorter timeout for tests to speed up execution - timeout = 30000 if debug else 45000 - success = await goto_with_auth_check( - page, - context, - f"https://client.schwab.com/app/research/#/stocks/{ticker}", - debug=debug, - timeout=timeout, - ) - if not success: - return fail( - "Authentication failed while navigating to research page", - ErrorType.AUTHENTICATION, - retryable=True, - ) - - # Validate ticker by checking for stock page content - # Schwab doesn't redirect on invalid tickers, but the page content is empty/invalid - if debug: - logger.debug(f"Current page URL: {page.url}") - - # Wait for page content to load - Schwab's research page loads asynchronously - # Give it time to populate the DOM before validation - try: - # Wait for either company name or Morningstar section to appear - # This indicates the page has loaded stock-specific content - await page.wait_for_selector( - 'span.sdps-title-3.sc-sdps-solo-layout:not(.sdps-font-bold), #morningstar-section', - timeout=10000, - state='visible' - ) - except Exception as wait_err: - # If neither selector appears after 10 seconds, likely an invalid ticker - if debug: - logger.debug(f"Timeout waiting for stock content: {wait_err}") - return fail( - f"Invalid ticker: {ticker}. This appears not to be a valid stock ticker.", - ErrorType.VALIDATION, - retryable=False, - ) - - # Additional validation: check if we have valid stock page content - try: - has_valid_content = await page.evaluate(''' - () => { - // Look for company name span (valid stock pages have this) - const nameSpan = document.querySelector('span.sdps-title-3.sc-sdps-solo-layout:not(.sdps-font-bold)'); - if (nameSpan && nameSpan.textContent && nameSpan.textContent.trim().length > 2) { - return true; - } - - // Look for Morningstar section (valid stock pages have this) - const morningstarSection = document.querySelector('#morningstar-section'); - if (morningstarSection) { - return true; - } - - // Look for company profile description (valid stock pages have this) - const profileText = document.querySelector('p.sdps-text-body.sc-sdps-solo-layout'); - if (profileText && profileText.textContent && profileText.textContent.trim().length > 50) { - return true; - } - - // Look for any stock-related content - const stockContent = document.querySelector('#stock-details, #quote, [data-testid="stock-quote"]'); - if (stockContent) { - return true; - } - - return false; - } - ''') - - if debug: - logger.debug(f"Valid stock content detected: {has_valid_content}") - - if not has_valid_content: - if debug: - logger.debug(f"Invalid ticker detected - no stock content found") - return fail( - f"Invalid ticker: {ticker}. This appears not to be a valid stock ticker.", - ErrorType.VALIDATION, - retryable=False, - ) - except Exception as e: - logger.debug(f"Error checking for valid content: {e}") - # If we can't check, assume invalid and return error - return fail( - f"Invalid ticker: {ticker}. Unable to validate ticker.", - ErrorType.VALIDATION, - retryable=False, - ) - - # Company name - extract from page elements - company_name = None - try: - # Strategy 1: Extract from company name span element - company_name = await page.evaluate(''' - () => { - // Look for company name in title span - const nameSpan = document.querySelector('span.sdps-title-3.sc-sdps-solo-layout:not(.sdps-font-bold)'); - if (nameSpan && nameSpan.textContent && nameSpan.textContent.trim().length > 2) { - return nameSpan.textContent.trim(); - } - - // Fallback: Extract from company profile description - const profileText = document.querySelector('p.sdps-text-body.sc-sdps-solo-layout'); - if (profileText && profileText.textContent) { - const text = profileText.textContent.trim(); - // Extract company name before " designs" or " is" or " provides" - const match = text.match(/^([A-Za-z0-9\\s&\\.,'-]+?)(?:\\s+(?:designs|is|provides|manufactures|operates|offers|engages))/i); - if (match) { - return match[1].trim(); - } - } - - return null; - } - ''') - if debug and company_name: - logger.debug(f"Extracted company name: {company_name}") - except Exception as e: - logger.debug(f"Company name extraction error: {e}") - - # Morningstar section wait - try: - await page.wait_for_selector('#morningstar-section', timeout=30000) - except Exception: - logger.debug("#morningstar-section not found within timeout") - - # Dividends - try: - dividend_data = await extract_dividend_data(page, debug=debug) - except Exception as exc: - logger.debug(f"Dividend extraction error: {exc}") - dividend_data = {} - - # Find report and download/cache - report_url, report_date = await find_report(page, debug=debug) - data: Dict[str, Any] = {} - if report_date: - data["Morningstar Equity Report Date"] = report_date.strip() - if report_url: - # Only store actual URL, not the __CLICK_TO_OPEN__ marker - if report_url != '__CLICK_TO_OPEN__': - data["Morningstar Equity Report URL"] = report_url - pdf_bytes = await download_report_as_bytes(page, report_url, debug=debug) - else: - pdf_bytes = None - - parsed_data: Dict[str, Any] = {} - if pdf_bytes: - if report_date: - from datetime import datetime - try: - dt = datetime.strptime(report_date, "%b %d, %Y") - formatted_date = dt.strftime("%m-%d-%Y") - except Exception: - formatted_date = report_date.replace(" ", "-") - else: - formatted_date = time.strftime("%m-%d-%Y") - write_cached_pdf(ticker, formatted_date, pdf_bytes) - try: - parsed_data = parse_pdf(pdf_bytes) - parsed_data["source"] = "live" - except Exception as exc: - logger.debug(f"PDF parsing failed: {exc}") - parsed_data = {"error": "Failed to parse Morningstar report"} - else: - cached = read_cached_pdf(ticker) - if cached: - try: - parsed_data = parse_pdf(cached) - parsed_data["source"] = "cache" - except Exception as exc: - logger.debug(f"Cached PDF parsing failed: {exc}") - parsed_data = {"error": "Failed to parse cached Morningstar report"} - else: - parsed_data = {"error": f"Failed to download and no cache available for {ticker}"} - - morningstar = MorningstarData( - ticker=ticker, - company_name=company_name, - previous_dividend_payment=dividend_data.get("Previous Dividend Payment"), - previous_pay_date=dividend_data.get("Previous Pay Date"), - previous_ex_date=dividend_data.get("Previous Ex-Dividend Date"), - frequency=dividend_data.get("Frequency"), - annual_dividend_rate=dividend_data.get("Annual Dividend Rate"), - annual_dividend_yield=dividend_data.get("Annual Dividend Yield"), - fair_value=parsed_data.get("Fair Value"), - economic_moat=parsed_data.get("Economic Moat"), - capital_allocation=parsed_data.get("Capital Allocation"), - rating=_safe_int(parsed_data.get("Morningstar Rating")), - one_star_price=parsed_data.get("1-Star Price"), - five_star_price=parsed_data.get("5-Star Price"), - assessment=parsed_data.get("Assessment"), - range_52_week=parsed_data.get("52-Week Range"), - dividend_yield=parsed_data.get("Dividend Yield"), - investment_style=parsed_data.get("Investment Style"), - report_url=data.get("Morningstar Equity Report URL"), - report_date=data.get("Morningstar Equity Report Date"), - source=parsed_data.get("source"), - ) - - if parsed_data.get("error"): - return fail(parsed_data["error"], ErrorType.PARSING, retryable=True) - - return ok(morningstar) - - finally: - try: - if page is not None: - await page.close() - except Exception: - pass - try: - if context is not None: - await context.close() - except Exception: - pass - for handle in (browser,): - try: - if handle is not None: - await handle.close() - except Exception: - pass - try: - if p is not None: - await p.stop() - except Exception: - pass - - -def _safe_int(value: Any) -> Optional[int]: - if value is None: - return None - try: - return int(str(value).strip()) - except (TypeError, ValueError): - return None diff --git a/schwab_scraper/features/transactions/__init__.py b/schwab_scraper/features/transactions/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/schwab_scraper/features/transactions/parser.py b/schwab_scraper/features/transactions/parser.py deleted file mode 100644 index e17d8a9..0000000 --- a/schwab_scraper/features/transactions/parser.py +++ /dev/null @@ -1,47 +0,0 @@ -from __future__ import annotations - -import csv -import io -from dataclasses import asdict -from typing import List, Dict, Any - -from ...core.models import TransactionRecord, TransactionData, AccountInfo - - -def parse_csv_content(csv_bytes: bytes) -> List[TransactionRecord]: - """ - Parse Schwab transaction CSV bytes into a list of TransactionRecord. - - Expected headers: - Date,Action,Symbol,Description,Quantity,Price,Fees & Comm,Amount - """ - text_stream = io.StringIO(csv_bytes.decode("utf-8")) - reader = csv.DictReader(text_stream) - - records: List[TransactionRecord] = [] - for row in reader: - records.append( - TransactionRecord( - date=(row.get("Date") or "").strip(), - action=(row.get("Action") or "").strip(), - symbol=(row.get("Symbol") or None) or None, - description=(row.get("Description") or "").strip(), - quantity=(row.get("Quantity") or None) or None, - price=(row.get("Price") or None) or None, - fees_comm=(row.get("Fees & Comm") or None) or None, - amount=(row.get("Amount") or None) or None, - ) - ) - return records - - -def to_dicts(transaction_data: TransactionData) -> Dict[str, Any]: - """Convert TransactionData to plain dicts for JSON output.""" - return { - "account_info": asdict(transaction_data.account_info), - "transactions": [asdict(r) for r in transaction_data.transactions], - "date_range": transaction_data.date_range, - "export_date": transaction_data.export_date, - "total_transactions": transaction_data.total_transactions, - "source": transaction_data.source, - } diff --git a/schwab_scraper/features/transactions/scraper.py b/schwab_scraper/features/transactions/scraper.py deleted file mode 100644 index e6a7e33..0000000 --- a/schwab_scraper/features/transactions/scraper.py +++ /dev/null @@ -1,2561 +0,0 @@ -from __future__ import annotations - -import asyncio -import re -import time -from datetime import datetime, timezone -from typing import Optional, List, Dict, Any - -from ...utils.logging import save_debug_artifact - -# Export options constants -DEFAULT_HISTORY_URL = "https://client.schwab.com/app/accounts/history/#/" - - -async def goto_history(page, context=None, debug: bool = False) -> None: - if context: - from ...browser.navigation import goto_with_auth_check - auth_success = await goto_with_auth_check(page, context, DEFAULT_HISTORY_URL, debug=debug) - if not auth_success: - raise Exception("Authentication failed during navigation to history page") - else: - # Fallback for cases where context isn't available - await page.goto(DEFAULT_HISTORY_URL, timeout=60000) - await page.wait_for_load_state('domcontentloaded') - - # Wait for one of the known panels in history page to ensure full UI ready - try: - await page.wait_for_selector('.sdps-page-header__account-selector, #account-selector', timeout=15000) - except Exception: - # Fallback wait - await page.wait_for_timeout(5000) - if debug: - try: - png = await page.screenshot(full_page=True) - save_debug_artifact("debug_export_history_loaded.png", png) - except Exception: - pass - - -async def open_export_panel(page, debug: bool = False) -> None: - # Close any obstructing overlay dialogs first (e.g., What's changed) - try: - overlays = page.locator("div[role='dialog']").filter(has_text="What's changed") - if await overlays.count() > 0 and await overlays.first.is_visible(): - if debug: - print("DEBUG: Closing 'What's changed' overlay before export") - close_btn = overlays.first.locator("button[aria-label='Close'], button:has-text('Close')").first - try: - await close_btn.click() - except Exception: - await page.keyboard.press('Escape') - await page.wait_for_timeout(500) - except Exception: - pass - - if debug: - print("DEBUG: Clicking top-level Export button to open options panel") - # Use aria-label selector to target the visible Export button (not the hidden one in dialogs) - export_button = page.locator('button[aria-label="Export"]').first - await export_button.scroll_into_view_if_needed() - await export_button.click() - await page.wait_for_timeout(1500) - - -async def select_time_period(page, time_period: Optional[str], container=None, debug: bool = False) -> None: - if not time_period: - return - try: - scope = container or page - period_selector = scope.locator(f'text={time_period}').first - if await period_selector.is_visible(): - await period_selector.click() - await page.wait_for_timeout(1000) - if debug: - print(f"DEBUG: Selected time period: {time_period}") - except Exception: - # Non-fatal; keep defaults - pass - - -async def ensure_csv_format(page, container=None, debug: bool = False) -> None: - try: - scope = container or page - csv_option = scope.locator('text=CSV').first - if await csv_option.is_visible(): - await csv_option.click() - await page.wait_for_timeout(1000) - if debug: - print("DEBUG: Ensured CSV format is selected") - except Exception: - pass - - -def parse_suggested_filename(filename: str) -> Dict[str, str]: - """Parse Schwab's suggested filename into an account label and timestamp. - - Robustly handles extra underscores, composite account names, and suffixes. - Returns a normalized label like "Joint_XXX604" and extracted timestamp. - """ - # Timestamp - ts_match = re.search(r"(\d{8}-\d{6})", filename) - ts = ts_match.group(1) if ts_match else datetime.now(timezone.utc).strftime('%Y%m%d-%H%M%S') - - stem = filename.rsplit('.', 1)[0] - # Remove trailing _Transactions_ if present - stem_wo_suffix = re.sub(r"_Transactions_\d{8}-\d{6}$", "", stem) - - # Try direct XXX pattern - m = re.search(r"XXX(\d{3,4})", stem_wo_suffix) - if m: - ending = m.group(1) - prefix = stem_wo_suffix.split(f"XXX{ending}")[0].rstrip('_') - # Sanitize prefix to create label - prefix = re.sub(r"[^A-Za-z0-9]+", "_", prefix).strip('_') or "Account" - label = f"{prefix}_XXX{ending}" - return {"label": label, "ts": ts} - - # Try '… 604' or '... 604' or 'ending in 6 0 4' - m2 = re.search(r"(?:…|\.\.\.|ending in)\s*([0-9\s]{3,8})", stem_wo_suffix, flags=re.IGNORECASE) - if m2: - digits = re.sub(r"\s+", "", m2.group(1)) - ending = digits[-3:] - # Prefix is text before ellipsis/ending in phrase - prefix = re.split(r"(?:…|\.\.\.|ending in)", stem_wo_suffix, flags=re.IGNORECASE)[0].rstrip('_ ') - prefix = re.sub(r"[^A-Za-z0-9]+", "_", prefix).strip('_') or "Account" - label = f"{prefix}_XXX{ending}" - return {"label": label, "ts": ts} - - # Fallback - safe = re.sub(r"[^A-Za-z0-9]+", "_", stem_wo_suffix).strip('_') - return {"label": safe, "ts": ts} - - -def _label_matches_account_query(account_query: Optional[str], label: str) -> bool: - """Determine whether a parsed filename `label` (e.g., "Joint_XXX604") - matches an `account_query` which could be a full label ("PLA_Assets_XXX674"), - an ending ("604"), or a type substring ("Joint"/"PLA"). - - This function is used to verify that the downloaded file corresponds to - the intended account before we accept it. - """ - if not account_query: - return True - - query = str(account_query).strip() - label_lower = label.lower() - query_lower = query.lower() - - # Exact label match - if query == label: - return True - - # Match by ending digits in the label (from _XXX####) - m = re.search(r"XXX(\d{3,4})$", label) - if m: - ending = m.group(1) - if query.isdigit(): - # Allow matching last 3 digits - if ending.endswith(query): - return True - # Also allow matching on suffix-only query like 'XXX604' - if query_upper := query.upper(): - if query_upper == f"XXX{ending}": - return True - - # Substring in label (match by type/name) - if query_lower in label_lower: - return True - - return False - - -def _normalize_label_from_text(text: str) -> Optional[str]: - """Create a normalized account label (Type_XXX###) from raw menu text.""" - if not text: - return None - # Collapse whitespace - t = re.sub(r"\s+", " ", text).strip() - # Try XXX123 pattern - m = re.search(r"XXX(\d{3,4})", t) - if m: - ending = m.group(1)[-3:] - prefix = t.split(f"XXX{m.group(1)}")[0].strip(" -•") - prefix = re.sub(r"[^A-Za-z0-9]+", "_", prefix).strip('_') or "Account" - return f"{prefix}_XXX{ending}" - # Try 'ending in' or ellipsis with digits - m2 = re.search(r"(?:ending in|…|\.\.\.)\s*([0-9\s]{3,8})", t, flags=re.IGNORECASE) - if m2: - ending = re.sub(r"\s+", "", m2.group(1))[-3:] - prefix = re.split(r"(?:ending in|…|\.\.\.)", t, flags=re.IGNORECASE)[0].strip(" -•") - prefix = re.sub(r"[^A-Za-z0-9]+", "_", prefix).strip('_') or "Account" - return f"{prefix}_XXX{ending}" - return None - - -def parse_account_text(text): - """Parse account dropdown text to extract structured account info with enhanced pattern matching""" - text = text.strip() - lines = [line.strip() for line in text.split('\n') if line.strip()] - - account_type = None - account_ending = None - - # Enhanced pattern matching with multiple strategies - # First, check for the live Schwab format: "TypeType…XXXAccount ending in X Y Z" - live_format_match = re.search(r'^([A-Za-z\s]+)\1…(\d{3,4})Account ending in ([\d\s]+)', text) - if live_format_match: - account_type = live_format_match.group(1).strip() - account_ending = live_format_match.group(2) - # Validate the ending matches the spaced version - spaced_ending = live_format_match.group(3).replace(' ', '') - if account_ending == spaced_ending: - if account_type and account_ending: - normalized_type = account_type.replace(' ', '_').replace('-', '_') - label = f'{normalized_type}_XXX{account_ending[-3:]}' - return { - 'label': label, - 'type': account_type, - 'ending': account_ending[-3:] - } - - # Parse line by line for other formats - for line in lines: - # Strategy 1: 'Account ending in X Y Z' format - ending_match = re.search(r'Account ending in (\d \d \d)', line) - if ending_match: - account_ending = ending_match.group(1).replace(' ', '') - continue - - # Strategy 2: 'Account ending in XXX' format (without spaces) - ending_match_no_space = re.search(r'Account ending in (\d{3,4})', line) - if ending_match_no_space: - account_ending = ending_match_no_space.group(1) - continue - - # Strategy 3: Account type with …XXX or ...XXX pattern - type_match = re.search(r'^([A-Za-z\s]+)\s*[…\.]{1,3}(\d{3,4})', line) - if type_match: - account_type = type_match.group(1).strip() - account_ending = type_match.group(2) - continue - - # Strategy 4: Account type with XXX pattern - type_match_xxx = re.search(r'^([A-Za-z\s]+)\s*XXX(\d{3,4})', line) - if type_match_xxx: - account_type = type_match_xxx.group(1).strip() - account_ending = type_match_xxx.group(2) - continue - - # Strategy 5: Direct account type and ending pattern (e.g., "Joint 604") - direct_match = re.search(r'^([A-Za-z\s]+?)\s+(\d{3,4})\s*$', line) - if direct_match: - candidate_type = direct_match.group(1).strip() - candidate_ending = direct_match.group(2) - # Only accept if it looks like a known account type - if any(known_type.lower() in candidate_type.lower() - for known_type in ['joint', 'ira', 'individual', 'bogle', 'roth', 'general', 'pla', 'checking']): - account_type = candidate_type - account_ending = candidate_ending - continue - - # Strategy 6: Just account type name (for multi-line parsing) - known_account_types = [ - 'Joint Account', 'Joint', 'IRA Account', 'IRA', 'Individual Account', 'Individual', 'Bogle', - 'IRA Rachel', 'Roth IRA Rachel', 'PLA Assets', 'Roth IRA', 'ROTH IRA', 'General Checking', - 'PLA Line', 'Roth', 'Traditional IRA' - ] - # Try exact match first - if line in known_account_types and not account_type: - account_type = line - - # Strategy 7: Partial matches for compound account types (preserve original line) - if not account_type: - for known_type in ['joint', 'ira', 'individual', 'bogle', 'roth', 'general', 'pla', 'checking']: - if known_type in line.lower() and len(line.strip()) < 50 and len(line.strip()) > 2: - # Use the original line text to preserve exact formatting - account_type = line.strip() - break - - # Final validation and formatting - if account_type and account_ending: - # Ensure ending is at least 3 digits - if len(account_ending) >= 3: - # Normalize account type for labeling - normalized_type = account_type.replace(' ', '_').replace('-', '_') - label = f'{normalized_type}_XXX{account_ending[-3:]}' - return { - 'label': label, - 'type': account_type, - 'ending': account_ending[-3:] - } - - # Debug fallback - if we have promising text but couldn't parse it - if any(keyword in text.lower() for keyword in ['joint', 'ira', 'individual', 'bogle', 'account']): - # Extract any 3-4 digit number as potential account ending - digit_match = re.search(r'\b(\d{3,4})\b', text) - if digit_match: - # Try to extract account type from context - for keyword in ['joint', 'ira', 'individual', 'bogle', 'roth', 'general', 'pla']: - if keyword in text.lower(): - account_type = keyword.title() - account_ending = digit_match.group(1) - label = f'{account_type}_XXX{account_ending[-3:]}' - return { - 'label': label, - 'type': account_type, - 'ending': account_ending[-3:] - } - - return None - - -async def discover_accounts_with_numbers(page, debug: bool = False) -> List[Dict[str, str]]: - """Discover accounts including their actual account numbers for API switching. - - Returns list of account info including: - - label: Normalized label like "PLA_Assets_XXX674" - - type: Account type like "PLA Assets" - - ending: Last 3 digits like "674" - - account_number: Full account number like "7485-7674" (if available) - """ - if debug: - print("DEBUG: Discovering accounts with account numbers...") - - # First get basic account info - basic_accounts = await discover_accounts_from_page(page, debug=debug) - - # Now try to get account numbers by examining dropdown elements more closely - try: - # Click account selector to open dropdown - await page.locator('.sdps-page-header__account-selector, #account-selector').first.click() - await page.wait_for_timeout(2000) - - # Look for elements with account numbers - account_elements = await page.evaluate(''' - () => { - const elements = Array.from(document.querySelectorAll('button, a, [data-account], [data-number]')); - return elements.map(el => { - const text = (el.textContent || el.innerText || '').trim(); - const dataAccount = el.getAttribute('data-account'); - const dataNumber = el.getAttribute('data-number'); - const onclick = el.onclick ? el.onclick.toString() : ''; - const href = el.href || ''; - - // Look for account numbers in various attributes - let accountNumber = null; - - // Check data attributes - if (dataAccount && dataAccount.includes('-')) { - accountNumber = dataAccount; - } else if (dataNumber && dataNumber.includes('-')) { - accountNumber = dataNumber; - } - - // Check onclick handlers for account numbers - const numberMatch = onclick.match(/(\\d{4}-\\d{3,4})/); - if (numberMatch) { - accountNumber = numberMatch[1]; - } - - // Check href for account numbers - const hrefMatch = href.match(/(\\d{4}-\\d{3,4})/); - if (hrefMatch) { - accountNumber = hrefMatch[1]; - } - - return { - text: text.substring(0, 100), - accountNumber: accountNumber, - element: el.tagName + (el.id ? '#' + el.id : '') + (el.className ? '.' + el.className.split(' ')[0] : '') - }; - }).filter(item => item.text.includes('ending in') || item.accountNumber); - } - ''') - - if debug: - print(f"DEBUG: Found {len(account_elements)} elements with potential account numbers") - for elem in account_elements[:5]: - print(f"DEBUG: - {elem['text'][:50]} -> {elem['accountNumber']} ({elem['element']})") - - # Match account numbers to basic account info - enhanced_accounts = [] - for account in basic_accounts: - enhanced_account = account.copy() - enhanced_account['account_number'] = None - - # Try to find matching account number - account_ending = account['ending'] - for elem in account_elements: - if account_ending in elem['text'] and elem['accountNumber']: - enhanced_account['account_number'] = elem['accountNumber'] - break - - enhanced_accounts.append(enhanced_account) - - # Close dropdown - try: - await page.keyboard.press('Escape') - await page.wait_for_timeout(500) - except: - pass - - if debug: - print(f"DEBUG: Enhanced accounts with numbers:") - for acc in enhanced_accounts: - print(f"DEBUG: - {acc['label']} -> {acc.get('account_number', 'NO_NUMBER')}") - - return enhanced_accounts - - except Exception as e: - if debug: - print(f"DEBUG: Error discovering account numbers: {e}") - # Fall back to basic accounts without numbers - return [dict(acc, account_number=None) for acc in basic_accounts] - - -async def discover_accounts_from_page(page, debug: bool = False) -> List[Dict[str, str]]: - """Discover account entries from the page-level selector dropdown with enhanced reliability.""" - # Note: This function assumes the page is already on the transaction history page - - if debug: - print("DEBUG: Starting enhanced account discovery...") - # Take initial screenshot - try: - png = await page.screenshot(full_page=True) - save_debug_artifact("debug_account_discovery_start.png", png) - except Exception: - pass - - # Enhanced account selector strategy with multiple attempts - click_success = False - max_attempts = 3 - - for attempt in range(max_attempts): - if debug: - print(f"DEBUG: Attempt {attempt + 1}/{max_attempts} - Searching for account selector...") - - # Enhanced selector discovery with more patterns - account_selector_candidates = await page.evaluate(''' - () => { - const selectors = [ - '#account-selector', - '.sdps-page-header__account-selector', - '[id*="account-selector"]', - '[class*="account-selector"]', - 'button[aria-label*="Account"]', - 'button[title*="Account"]', - '[data-testid*="account"]', - 'button:has-text("Account")', - '[class*="account"][class*="dropdown"]', - '[class*="account"][class*="button"]' - ]; - - const results = []; - for (const selector of selectors) { - try { - const elements = document.querySelectorAll(selector); - elements.forEach((el, i) => { - if (el.offsetParent !== null && el.offsetWidth > 0 && el.offsetHeight > 0) { - const text = (el.textContent || el.innerText || '').trim(); - results.push({ - selector: selector, - index: i, - id: el.id, - className: el.className, - text: text.substring(0, 100), - tagName: el.tagName.toLowerCase(), - isVisible: el.offsetParent !== null, - hasAccountText: text.toLowerCase().includes('account') || text.match(/\\d{3}/) !== null - }); - } - }); - } catch (e) { - // Skip selectors that cause errors - } - } - return results.sort((a, b) => (b.hasAccountText ? 1 : 0) - (a.hasAccountText ? 1 : 0)); - } - ''') - - if debug and len(account_selector_candidates) > 0: - print(f"DEBUG: Found {len(account_selector_candidates)} potential account selector elements:") - for candidate in account_selector_candidates[:5]: # Show top candidates - className = candidate.get('className', '')[:50] if candidate.get('className') else '' - print(f"DEBUG: - {candidate['tagName']} {candidate['selector']}#{candidate['id']}.{className} text: '{candidate['text'][:50]}' hasAccountText: {candidate.get('hasAccountText')}") - - # Try clicking with enhanced strategy - clicked = await page.evaluate(''' - () => { - const selectors = [ - '.sdps-page-header__account-selector', - '#account-selector', - '[id*="account-selector"]', - '[class*="account-selector"]', - 'button[aria-label*="Account"]' - ]; - - for (const selector of selectors) { - const elements = document.querySelectorAll(selector); - for (const button of elements) { - if (button.offsetParent !== null && button.offsetWidth > 0 && button.offsetHeight > 0) { - try { - button.scrollIntoView({ behavior: 'smooth', block: 'center' }); - button.click(); - return { success: true, selector: selector, text: (button.textContent || '').trim().substring(0, 50) }; - } catch (e) { - continue; - } - } - } - } - return { success: false }; - } - ''') - - if debug: - print(f"DEBUG: Account selector click result: {clicked}") - - if clicked.get('success'): - click_success = True - break - - # Wait before retry - if attempt < max_attempts - 1: - if debug: - print(f"DEBUG: Click attempt {attempt + 1} failed, waiting before retry...") - await page.wait_for_timeout(2000) - - if not click_success: - if debug: - print("DEBUG: All account selector click attempts failed") - # Take failure screenshot - try: - png = await page.screenshot(full_page=True) - save_debug_artifact("debug_account_selector_click_failed.png", png) - except Exception: - pass - return [] - - # Wait longer for dropdown to appear after successful click - await page.wait_for_timeout(4000) - - # Enhanced dropdown discovery with better pattern matching - dropdown = None - dropdown_search_attempts = 2 - - for search_attempt in range(dropdown_search_attempts): - if debug: - print(f"DEBUG: Dropdown search attempt {search_attempt + 1}/{dropdown_search_attempts}") - - # Enhanced dropdown selector strategy - dropdown_candidates = await page.evaluate(''' - () => { - const selectors = [ - '[role="menu"]', - '[role="listbox"]', - '[role="dialog"]', - '[class*="dropdown"]', - '[class*="menu"]', - '[class*="overlay"]', - '[class*="modal"]', - '[class*="account"]', - '[class*="selector"]', - 'div[style*="position: absolute"]', - 'div[style*="z-index"]' - ]; - - const candidates = []; - for (const selector of selectors) { - try { - const elements = document.querySelectorAll(selector); - elements.forEach((elem, i) => { - if (elem.offsetParent !== null && elem.offsetWidth > 0 && elem.offsetHeight > 0) { - const text = (elem.textContent || elem.innerText || '').trim(); - const hasAccountPattern = ( - text.includes('ending in') || - /…\\d{3,4}|XXX\\d{3,4}|\\.\\.\\.\\d{3,4}/.test(text) || - (/joint|ira|individual|bogle|account/i.test(text) && /\\d{3}/.test(text)) - ); - - if (text.length > 10 && hasAccountPattern) { - candidates.push({ - selector: selector, - index: i, - element: elem, - text: text.substring(0, 200), - score: hasAccountPattern ? 1 : 0, - className: elem.className - }); - } - } - }); - } catch (e) { - // Skip problematic selectors - } - } - return candidates.sort((a, b) => b.score - a.score); - } - ''') - - if debug: - print(f"DEBUG: Found {len(dropdown_candidates)} dropdown candidates") - for candidate in dropdown_candidates[:3]: # Show top candidates - preview = candidate.get('text', '').replace('\n', ' ')[:100] - print(f"DEBUG: - {candidate['selector']} (score: {candidate.get('score')}) text: {preview}") - - # Select best candidate - if dropdown_candidates: - dropdown = await page.query_selector_all(dropdown_candidates[0]['selector']) - if dropdown: - dropdown = dropdown[dropdown_candidates[0]['index']] - if debug: - print(f"DEBUG: Selected dropdown with selector: {dropdown_candidates[0]['selector']}") - break - - # If no dropdown found, wait and try again - if search_attempt < dropdown_search_attempts - 1: - if debug: - print("DEBUG: No suitable dropdown found, waiting and retrying...") - await page.wait_for_timeout(2000) - # Try clicking again in case dropdown closed - await page.evaluate(''' - () => { - const button = document.querySelector('.sdps-page-header__account-selector, #account-selector'); - if (button) button.click(); - } - ''') - await page.wait_for_timeout(2000) - - if not dropdown: - # Close any open dropdowns and return empty - await page.click('body') - if debug: - print("DEBUG: No suitable account dropdown found after all attempts") - # Take failure screenshot for debugging - try: - png = await page.screenshot(full_page=True) - save_debug_artifact("debug_account_dropdown_not_found.png", png) - except Exception: - pass - return [] - - # Enhanced account parsing with better error handling - if debug: - # Take screenshot of dropdown for debugging - try: - png = await page.screenshot(full_page=True) - save_debug_artifact("debug_account_dropdown_opened.png", png) - except Exception: - pass - - # Get all potential account elements with enhanced selection - account_elements = await dropdown.query_selector_all('button, a, [role="option"], li, div, span') - accounts = [] - seen_endings = set() - - if debug: - print(f"DEBUG: Found {len(account_elements)} potential account elements in dropdown") - - # Enhanced parsing with multiple strategies - for elem in account_elements: - try: - text = await elem.inner_text() - if not text or len(text.strip()) < 3: - continue - - # Enhanced pattern matching for account detection - has_account_pattern = ( - 'ending in' in text or - re.search(r'\d \d \d', text) or - re.search(r'…\d{3,4}|XXX\d{3,4}|\.\.\.\d{3,4}', text) or - (any(keyword in text.lower() for keyword in ['joint', 'ira', 'individual', 'bogle', 'roth', 'general', 'pla']) and re.search(r'\d{3}', text)) - ) - - if not has_account_pattern: - continue - - # Skip navigation and header elements - skip_phrases = [ - 'Edit Account Nicknames & Groups', 'Other Accounts', 'Brokerage Accounts', 'Schwab Bank Accounts', - 'Select an account', 'Account selector', 'Choose account', 'Switch account' - ] - if any(skip_phrase in text for skip_phrase in skip_phrases): - continue - - parsed = parse_account_text(text) - if parsed and parsed['ending'] not in seen_endings: - seen_endings.add(parsed['ending']) - accounts.append(parsed) - if debug: - print(f"DEBUG: Successfully parsed account: {parsed['type']} ending in {parsed['ending']} (label: {parsed['label']})") - elif debug and text.strip(): - print(f"DEBUG: Failed to parse account text: '{text[:100]}'") - - except Exception as e: - if debug: - print(f"DEBUG: Error processing account element: {e}") - continue - - # Close dropdown with enhanced cleanup - try: - await page.keyboard.press('Escape') # Try escape first - await page.wait_for_timeout(500) - await page.click('body') # Fallback click - await page.wait_for_timeout(1000) - except Exception: - pass - - if debug: - print(f"DEBUG: Successfully discovered {len(accounts)} accounts from dropdown") - if accounts: - for account in accounts: - print(f"DEBUG: - {account['label']} ({account['type']} ending {account['ending']})") - - return accounts - - -async def _resolve_export_dialog(page, debug: bool = False): - """Find the export transactions dialog robustly. - Prefer dialog with aria-labelledby containing 'export-transactions', - otherwise choose the last visible dialog that contains a CSV option or Export button. - """ - dialogs = page.locator("div[role='dialog']") - - # Strategy 1: aria-labelledby hint - candidate = page.locator("div[role='dialog'][aria-labelledby*='export-transactions']").last - if await candidate.count() > 0 and await candidate.is_visible(): - if debug: - print("DEBUG: Found export dialog via aria-labelledby contains 'export-transactions'") - return candidate - - # Strategy 2: visible dialog that contains CSV option - csv_candidate = dialogs.filter(has=page.locator("text=CSV")).last - if await csv_candidate.count() > 0 and await csv_candidate.is_visible(): - if debug: - print("DEBUG: Found export dialog via presence of CSV option") - return csv_candidate - - # Strategy 3: visible dialog that contains an Export button - export_candidate = dialogs.filter(has=page.locator("button:has-text('Export')")).last - if await export_candidate.count() > 0 and await export_candidate.is_visible(): - if debug: - print("DEBUG: Found export dialog via presence of dialog Export button") - return export_candidate - - # Strategy 4: fallback to last dialog - if debug: - print("DEBUG: Falling back to last dialog; may be incorrect") - return dialogs.last - - -async def _ensure_account_in_export_dialog(page, dialog, account_query: Optional[str], debug: bool = False) -> bool: - """Ensure the export dialog, if it contains its own account selector, is set to the requested account. - - Returns True if either no dialog-level account selector exists or it was set/matched successfully. - Returns False if a dialog-level selector exists but we could not match/select target account. - """ - if not account_query: - return True - - try: - # Try to detect a dialog-level account indicator - current_in_dialog = await dialog.evaluate('''(root) => { - const text = (root.textContent || '').trim(); - return text ? text.substring(0, 300) : ''; - }''') - if debug: - print(f"DEBUG: Export dialog initial text preview: {current_in_dialog[:120]}…") - - # If dialog text already contains our target pattern, consider it set - def _to_match_str(q: str) -> str: - return q.replace('_XXX', ' ending in ').replace('_', ' ') - if current_in_dialog and _to_match_str(account_query) in current_in_dialog: - if debug: - print("DEBUG: Dialog appears to already reference the target account") - return True - - # Try to find a dialog-level account selector trigger (combobox/button) - selector_candidates = [ - '[role="combobox"]', - 'button:has-text("Account")', - 'button[aria-haspopup="listbox"]', - '[aria-controls*="account"], [id*="account"], [class*="account"]' - ] - - found_trigger = None - for sel in selector_candidates: - try: - loc = dialog.locator(sel).first - if await loc.count() > 0 and await loc.is_visible(): - found_trigger = loc - break - except Exception: - continue - - if not found_trigger: - # No obvious dialog-level selector; assume page-level selection applies - if debug: - print("DEBUG: No dialog-level account selector found; relying on page-level selection") - return True - - # Open the dialog-level account dropdown - try: - await found_trigger.scroll_into_view_if_needed() - await found_trigger.click() - await page.wait_for_timeout(500) - except Exception: - pass - - # Find options container within dialog - options_container = None - option_container_selectors = [ - '[role="listbox"]', '[role="menu"]', '[class*="menu"]', '[class*="list"]', '[class*="dropdown"]' - ] - for sel in option_container_selectors: - try: - el = await dialog.query_selector(sel) - if el: - options_container = el - break - except Exception: - continue - - if not options_container: - # Fall back to page-wide, but prefer the dialog scope - options_container = dialog - - # Collect option-like elements and try to match - option_elements = await options_container.query_selector_all('button, a, [role="option"], li, div, span') - if debug: - print(f"DEBUG: Found {len(option_elements)} dialog option elements") - - # Define a helper to parse option text - target = None - for elem in option_elements: - try: - text = await elem.inner_text() - except Exception: - continue - if not text or len(text.strip()) < 3: - continue - parsed = parse_account_text(text) - if not parsed: - continue - if (account_query == parsed['label'] or - account_query == parsed['ending'] or - account_query.lower() in parsed['label'].lower() or - account_query.lower() in parsed['type'].lower()): - target = (elem, parsed) - break - - if not target: - if debug: - print("DEBUG: No matching account option found in dialog-level selector") - return False - - elem, parsed = target - # Click the matching option - try: - await page.evaluate('(el) => el.scrollIntoView({behavior: "smooth", block: "center"})', elem) - except Exception: - pass - click_ok = False - for _ in range(3): - try: - await elem.click(force=True) - click_ok = True - break - except Exception: - await page.wait_for_timeout(150) - continue - - if not click_ok: - if debug: - print("DEBUG: Failed to click dialog-level account option") - return False - - await page.wait_for_timeout(500) - - # Verify the dialog now references target account - try: - after_text = await dialog.evaluate('(root) => (root.textContent || "").trim().substring(0, 300)') - except Exception: - after_text = None - if after_text and _to_match_str(account_query) in after_text: - if debug: - print("DEBUG: Dialog-level account selection verified") - return True - if debug: - print("DEBUG: Dialog-level account selection not verified; proceeding anyway") - return True - except Exception as e: - if debug: - print(f"DEBUG: Exception in _ensure_account_in_export_dialog: {e}") - return True - -async def switch_account_with_verification(page, account_query: str, debug: bool = False) -> bool: - """Enhanced account switching with verification based on successful test script. - - Args: - page: Playwright page object - account_query: Account identifier (ending digits, type, or full label like 'PLA_Assets_XXX674') - debug: Enable debug output - - Returns: - True if switch was successful and verified, False otherwise - """ - if not account_query: - return False - - try: - if debug: - print(f"DEBUG: Starting enhanced account switch for: {account_query}") - - # Parse the account query to determine target - target_ending = None - target_type = None - - if "_XXX" in account_query: - parts = account_query.split("_XXX") - target_type = parts[0].replace("_", " ") - target_ending = parts[1][-3:] if len(parts[1]) >= 3 else parts[1] - elif account_query.isdigit() and len(account_query) >= 3: - target_ending = account_query[-3:] - else: - # Assume it's a type string like "PLA Assets" - target_type = account_query - - if debug: - print(f"DEBUG: Parsed target - type: '{target_type}', ending: '{target_ending}'") - - # Check current account selection first - current_account = await page.evaluate(''' - () => { - const button = document.querySelector('#account-selector'); - if (button) { - return button.textContent.trim(); - } - return ''; - } - ''') - - if debug: - print(f"DEBUG: Current account: {current_account}") - - # Check if we're already on the correct account - has_target_keywords = False - has_correct_ending = False - - if target_type: - # Check for both parts of target type (e.g., "PLA" AND "Assets") - type_parts = target_type.lower().split() - has_target_keywords = all(part in current_account.lower() for part in type_parts) - - if target_ending: - has_correct_ending = f"ending in {' '.join(target_ending)}" in current_account.lower() - - is_on_target = (has_target_keywords and has_correct_ending) if target_type and target_ending else \ - has_target_keywords if target_type else \ - has_correct_ending if target_ending else False - - if debug: - print(f"DEBUG: Keywords match: {has_target_keywords}, Ending match: {has_correct_ending}") - print(f"DEBUG: Already on target account: {is_on_target}") - - if is_on_target: - if debug: - print("DEBUG: Already on correct account, no switch needed") - return True - - # Need to switch - open account selector dropdown - if debug: - print("DEBUG: Opening account selector dropdown...") - - await page.locator('.sdps-page-header__account-selector, #account-selector').first.click() - await page.wait_for_timeout(2000) - - # Find all account options in dropdown - all_account_links = await page.query_selector_all('a[id*="account-selector-header"]') - if debug: - print(f"DEBUG: Found {len(all_account_links)} account options in dropdown") - - # Look for target account option - clicked_target = False - for i, link in enumerate(all_account_links): - link_text = await link.inner_text() - if debug: - print(f"DEBUG: Option {i+1}: {link_text}") - - # Check if this matches our target - text_lower = link_text.lower() - is_match = False - - if target_type and target_ending: - type_parts = target_type.lower().split() - has_type = all(part in text_lower for part in type_parts) - has_ending = target_ending in link_text - is_match = has_type and has_ending - elif target_type: - type_parts = target_type.lower().split() - is_match = all(part in text_lower for part in type_parts) - elif target_ending: - is_match = target_ending in link_text - - if is_match: - if debug: - print(f"DEBUG: ✓ Found target account option: {link_text}") - try: - # Try force click first - await link.click(force=True) - clicked_target = True - if debug: - print("DEBUG: ✓ Clicked account option (force)") - break - except Exception as e1: - if debug: - print(f"DEBUG: Force click failed: {e1}") - try: - # Try JavaScript click as fallback - await link.evaluate("element => element.click()") - clicked_target = True - if debug: - print("DEBUG: ✓ Clicked account option (JS)") - break - except Exception as e2: - if debug: - print(f"DEBUG: JS click also failed: {e2}") - continue - - if not clicked_target: - if debug: - print("DEBUG: ❌ Could not find or click target account option") - return False - - # Wait for page to update after account switch - if debug: - print("DEBUG: Waiting for page to update after account switch...") - await page.wait_for_timeout(3000) - - # Reload page to get fresh data for the new account - if debug: - print("DEBUG: Reloading page to get fresh data for selected account...") - await page.reload() - await page.wait_for_load_state('domcontentloaded') - await page.wait_for_timeout(2000) - - # Verify the account switch was successful - if debug: - print("DEBUG: Verifying account switch...") - - final_account = await page.evaluate(''' - () => { - const button = document.querySelector('#account-selector'); - if (button) { - return button.textContent.trim(); - } - return ''; - } - ''') - - if debug: - print(f"DEBUG: Final account: {final_account}") - - # Verify we're now on the target account - final_has_keywords = False - final_has_ending = False - - if target_type: - type_parts = target_type.lower().split() - final_has_keywords = all(part in final_account.lower() for part in type_parts) - - if target_ending: - final_has_ending = f"ending in {' '.join(target_ending)}" in final_account.lower() - - final_is_on_target = (final_has_keywords and final_has_ending) if target_type and target_ending else \ - final_has_keywords if target_type else \ - final_has_ending if target_ending else False - - if final_is_on_target: - if debug: - print("DEBUG: ✅ Account switch verification successful!") - return True - else: - if debug: - print(f"DEBUG: ❌ Account switch verification failed!") - print(f"DEBUG: Expected type '{target_type}' ending '{target_ending}'") - print(f"DEBUG: Got: {final_account}") - return False - - except Exception as e: - if debug: - print(f"DEBUG: Exception in switch_account_with_verification: {e}") - return False - - -async def switch_account_via_api(page, account_number: str, debug: bool = False) -> bool: - """Switch account using Schwab's SwitchAccount API endpoint. - - Args: - page: Playwright page object - account_number: Account number in format "1234-5678" - debug: Enable debug output - - Returns: - True if switch was successful, False otherwise - """ - try: - if debug: - print(f"DEBUG: Switching to account {account_number} via API...") - - # Make POST request to SwitchAccount endpoint - response = await page.evaluate(''' - async (accountNumber) => { - try { - const response = await fetch('/Areas/MvcGlobal/SwitchAccount', { - method: 'POST', - headers: { - 'Content-Type': 'application/json', - }, - body: JSON.stringify({ - selectionType: 'S', - accountNumber: accountNumber - }) - }); - const data = await response.json(); - return { success: response.ok, status: response.status, data: data }; - } catch (error) { - return { success: false, error: error.message }; - } - } - ''', account_number) - - if debug: - print(f"DEBUG: SwitchAccount API response: {response}") - - if response.get('success') and response.get('status') == 200: - # Wait for page to reflect the account change - await page.wait_for_timeout(2000) - - # Verify the switch worked by checking current account - current_account = await page.evaluate(''' - () => { - const header = document.querySelector('.sdps-page-header__account-selector, #account-selector'); - return header ? (header.textContent || '').trim() : ''; - } - ''') - - if debug: - print(f"DEBUG: Account after API switch: {current_account[:100]}") - - return True - else: - if debug: - print(f"DEBUG: SwitchAccount API failed: {response}") - return False - - except Exception as e: - if debug: - print(f"DEBUG: Exception in switch_account_via_api: {e}") - return False - - -async def switch_account_on_page(page, account_query: Optional[str], context=None, debug: bool = False) -> bool: - """Attempt to switch account using the page-level selector given a query like '604' or 'Joint'.""" - if not account_query: - return False - - try: - # ENHANCED DEBUGGING: Add detailed logging for production troubleshooting - if debug: - print(f"DEBUG: === ACCOUNT SWITCH DEBUG START ===") - print(f"DEBUG: Requested account: {account_query}") - print(f"DEBUG: Current URL: {page.url}") - - # Ensure on the history page - if 'accounts/history' not in page.url: - if debug: - print("DEBUG: Not on history page, navigating...") - await goto_history(page, context=context, debug=debug) - - # ENHANCED DEBUGGING: Take screenshot before attempting switch - if debug: - try: - png = await page.screenshot(full_page=True) - save_debug_artifact(f"debug_before_account_switch_{account_query}.png", png) - print("DEBUG: Screenshot saved before account switch attempt") - except Exception as e: - print(f"DEBUG: Failed to take screenshot: {e}") - - # Use enhanced selector discovery like discover_accounts_from_page - click_success = False - max_attempts = 3 - - for attempt in range(max_attempts): - if debug: - print(f"DEBUG: Account switch attempt {attempt + 1}/{max_attempts} for query: {account_query}") - - # ENHANCED DEBUGGING: Log current page state - if debug: - current_text = await page.evaluate('() => document.body.innerText.substring(0, 200)') - print(f"DEBUG: Current page text preview: {current_text}") - - # Enhanced selector discovery with multiple patterns - clicked = await page.evaluate(''' - () => { - const selectors = [ - '.sdps-page-header__account-selector', - '#account-selector', - '[id*="account-selector"]', - '[class*="account-selector"]', - 'button[aria-label*="Account"]', - 'button[title*="Account"]', - '[data-testid*="account"]', - 'button', // Generic button selector - '[class*="account"][class*="dropdown"]', - '[class*="account"][class*="button"]' - ]; - - for (const selector of selectors) { - const elements = document.querySelectorAll(selector); - for (const button of elements) { - if (button.offsetParent !== null && button.offsetWidth > 0 && button.offsetHeight > 0) { - try { - button.scrollIntoView({ behavior: 'auto', block: 'center' }); - // Use a slight delay before clicking to avoid context destruction issues - setTimeout(() => { - try { button.click(); } catch(e) {} - }, 10); - return { success: true, selector: selector, text: (button.textContent || '').trim().substring(0, 50) }; - } catch (e) { - continue; - } - } - } - } - return { success: false }; - } - ''') - - if debug: - print(f"DEBUG: Account selector click result: {clicked}") - - if clicked.get('success'): - click_success = True - break - - # Wait before retry - if attempt < max_attempts - 1: - if debug: - print(f"DEBUG: Click attempt {attempt + 1} failed, waiting before retry...") - await page.wait_for_timeout(2000) - - if not click_success: - if debug: - print("DEBUG: All account selector click attempts failed") - return False - - # ENHANCED DEBUGGING: Take screenshot after clicking selector - if debug: - try: - png = await page.screenshot(full_page=True) - save_debug_artifact(f"debug_after_selector_click_{account_query}.png", png) - print("DEBUG: Screenshot saved after selector click") - except Exception as e: - print(f"DEBUG: Failed to take screenshot: {e}") - - # Wait for dropdown to appear - await page.wait_for_timeout(300) - - # QUICK PATH: Try direct locator-based selection that pierces shadow DOM - try: - import re as _re - # Build robust name regex: match type and ending in compact or spaced form - q = str(account_query) - target_type = None - target_ending = None - if '_XXX' in q: - parts = q.split('_XXX') - target_type = parts[0].replace('_', ' ') - target_ending = parts[1][-3:] - elif q.isdigit() and len(q) in (3, 4): - target_ending = q[-3:] - name_regex = None - if target_type and target_ending: - spaced = ' '.join(list(target_ending)) - name_regex = _re.compile(rf"{_re.escape(target_type)}.*({_re.escape(target_ending)}|{_re.escape(spaced)}|XXX{_re.escape(target_ending)})", _re.I) - elif target_ending: - spaced = ' '.join(list(target_ending)) - name_regex = _re.compile(rf"({_re.escape(target_ending)}|{_re.escape(spaced)}|XXX{_re.escape(target_ending)})", _re.I) - else: - name_regex = _re.compile(_re.escape(q), _re.I) - - # Try ARIA-controlled listbox via header button first - try: - btn_loc = page.locator('#account-selector').first - controls_id = None - try: - controls_id = await btn_loc.get_attribute('aria-controls') - except Exception: - controls_id = None - if controls_id: - listbox = page.locator(f'#{controls_id}') - if await listbox.count() > 0 and await listbox.is_visible(): - # focus listbox and use get_by_role within it - try: - await listbox.focus() - except Exception: - pass - target_loc = None - # prefer role=option inside listbox - try: - opt = listbox.get_by_role('option', name=name_regex) - if await opt.count() > 0: - target_loc = opt.first - except Exception: - target_loc = None - if not target_loc: - # fallback to text filter - for css in ['[role="option"]', 'button', 'a', 'div', 'span', 'li']: - try: - cand = listbox.locator(css).filter(has_text=name_regex) - if await cand.count() > 0: - target_loc = cand.first - break - except Exception: - continue - if target_loc is not None: - try: - await target_loc.scroll_into_view_if_needed() - except Exception: - pass - try: - async with page.expect_navigation(wait_until='domcontentloaded', timeout=10000): - await target_loc.click() - except Exception: - await target_loc.click(force=True) - try: - await page.wait_for_load_state('domcontentloaded', timeout=5000) - except Exception: - pass - # Verify header reflects change - try: - header_now = await page.evaluate('''() => { - const sel = document.querySelector('.sdps-page-header__account-selector, #account-selector'); - return sel ? (sel.textContent || '').trim() : ''; - }''') - except Exception: - header_now = '' - if debug and header_now: - print(f"DEBUG: Header after listbox-controlled click: {header_now[:120]}...") - ok = False - if target_ending: - spaced = ' '.join(list(target_ending)) - if header_now and (target_ending in header_now or spaced in header_now): - ok = True - if target_type and ok: - ok = target_type.lower() in (header_now or '').lower() - if ok: - if debug: - print("DEBUG: Listbox-controlled selection verified") - return True - except Exception as e: - if debug: - print(f"DEBUG: aria-controls listbox path failed: {e}") - - # Try common roles first - for sel in [ - ('role=menuitem', page.get_by_role('menuitem', name=name_regex)), - ('role=option', page.get_by_role('option', name=name_regex)), - ('button', page.locator('button').filter(has_text=name_regex)), - ('a', page.locator('a').filter(has_text=name_regex)), - ('div', page.locator('div').filter(has_text=name_regex)), - ('span', page.locator('span').filter(has_text=name_regex)), - ]: - label, locator = sel - try: - count = await locator.count() - except Exception: - count = 0 - if count and count > 0: - target_loc = locator.first - try: - await target_loc.scroll_into_view_if_needed() - except Exception: - pass - try: - async with page.expect_navigation(wait_until='domcontentloaded', timeout=10000): - await target_loc.click() - except Exception: - await target_loc.click(force=True) - try: - await page.wait_for_load_state('domcontentloaded', timeout=5000) - except Exception: - pass - # Verify header reflects change - try: - header_now = await page.evaluate('''() => { - const sel = document.querySelector('.sdps-page-header__account-selector, #account-selector'); - return sel ? (sel.textContent || '').trim() : ''; - }''') - except Exception: - header_now = '' - if debug and header_now: - print(f"DEBUG: Header after locator-based click ({label}): {header_now[:120]}...") - ok = False - if target_ending: - spaced = ' '.join(list(target_ending)) - if header_now and (target_ending in header_now or spaced in header_now): - ok = True - if target_type and ok: - ok = target_type.lower() in (header_now or '').lower() - if ok: - if debug: - print("DEBUG: Locator-based selection verified") - return True - if debug: - print("DEBUG: Locator-based selection did not find a clickable element; falling back") - except Exception as e: - if debug: - print(f"DEBUG: Locator-based selection failed: {e}") - - # ENHANCED DEBUGGING: Check what's actually visible after dropdown click - if debug: - visible_elements = await page.evaluate(''' - () => { - const elements = document.querySelectorAll('[role="menu"], [role="listbox"], [role="dialog"], [class*="dropdown"], [class*="menu"], [class*="overlay"], [class*="modal"], [class*="account"], [class*="selector"], div[style*="position: absolute"], div[style*="z-index"]'); - return Array.from(elements).slice(0, 5).map(el => ({ - tag: el.tagName, - class: el.className, - id: el.id, - text: (el.textContent || el.innerText || '').trim().substring(0, 100), - visible: el.offsetParent !== null && el.offsetWidth > 0 && el.offsetHeight > 0 - })); - } - ''') - print(f"DEBUG: Visible dropdown elements: {visible_elements}") - - # Discover available accounts from the dropdown - accounts = await discover_accounts_from_page(page, debug=debug) - - if not accounts: - if debug: - print("DEBUG: No accounts discovered from dropdown") - return False - - if debug: - print(f"DEBUG: Discovered {len(accounts)} accounts from dropdown") - for acc in accounts: - print(f"DEBUG: - {acc['label']} ({acc['type']} ending {acc['ending']})") - - # ENHANCED DEBUGGING: Verify the account we're looking for exists - if debug: - matching_accounts = [acc for acc in accounts if account_query == acc['label'] or account_query == acc['ending'] or account_query.lower() in acc['label'].lower() or account_query.lower() in acc['type'].lower()] - print(f"DEBUG: Accounts matching query '{account_query}': {matching_accounts}") - - # Find matching account using robust matching logic - target_account = None - - # Try multiple matching strategies - for account in accounts: - # Strategy 1: Exact label match (e.g., "PLA_Assets_XXX674") - if account_query == account['label']: - target_account = account - break - - # Strategy 2: Match by ending digits (e.g., "674") - if account_query == account['ending']: - target_account = account - break - - # Strategy 3: Case-insensitive substring match in label - if account_query.lower() in account['label'].lower(): - target_account = account - break - - # Strategy 4: Match by account type (e.g., "PLA" in "PLA_Assets_XXX674") - if account_query.lower() in account['type'].lower(): - target_account = account - break - - if not target_account: - if debug: - print(f"DEBUG: No matching account found for query: {account_query}") - print(f"DEBUG: Available accounts: {[acc['label'] for acc in accounts]}") - return False - - if debug: - print(f"DEBUG: Found target account: {target_account['label']}") - - # ENHANCED DEBUGGING: Take screenshot before clicking target account - if debug: - try: - png = await page.screenshot(full_page=True) - save_debug_artifact(f"debug_before_target_click_{account_query}.png", png) - print("DEBUG: Screenshot saved before target account click") - except Exception as e: - print(f"DEBUG: Failed to take screenshot: {e}") - - # Try a direct ARIA role-based click first for reliability - try: - ending = target_account['ending'] - spaced = ' '.join(list(ending)) - acc_type = target_account['type'] - # Build a tolerant regex: type followed by either compact or spaced ending or XXX### - import re as _re - name_regex = _re.compile(rf"{_re.escape(acc_type)}.*({_re.escape(ending)}|{_re.escape(spaced)}|XXX{_re.escape(ending)})", _re.I) - # Prefer within a visible listbox if present - option_locator = page.locator('[role="listbox"] [role="option"]').filter(has_text=name_regex) - count = await option_locator.count() - if count == 0: - # Fallback to any role=option in document - option_locator = page.locator('[role="option"]').filter(has_text=name_regex) - count = await option_locator.count() - if count > 0: - target_opt = option_locator.first - try: - await target_opt.scroll_into_view_if_needed() - except Exception: - pass - try: - async with page.expect_navigation(wait_until='domcontentloaded', timeout=15000): - await target_opt.click() - except Exception: - await target_opt.click(force=True) - try: - await page.wait_for_load_state('domcontentloaded', timeout=8000) - except Exception: - pass - # Verify header reflects new selection - try: - header_after = await page.evaluate('''() => { - const sel = document.querySelector('.sdps-page-header__account-selector, #account-selector'); - return sel ? (sel.textContent || '').trim() : ''; - }''') - except Exception: - header_after = '' - if header_after and acc_type.lower() in header_after.lower() and (ending in header_after or spaced in header_after): - if debug: - print("DEBUG: Role=option click succeeded; account appears selected") - account_clicked = True - # Close dropdown best-effort - try: - await page.keyboard.press('Escape') - await page.wait_for_timeout(200) - except Exception: - pass - # short settle - await page.wait_for_timeout(300) - else: - if debug: - print("DEBUG: Role=option click did not verify selection; falling back to element strategies") - except Exception as e: - if debug: - print(f"DEBUG: Role=option strategy failed: {e}") - - # Try to find and click the target account option - # Get all potential account elements - dropdown_candidates = await page.evaluate(''' - () => { - const selectors = [ - '[role="menu"]', - '[role="listbox"]', - '[role="dialog"]', - '[class*="dropdown"]', - '[class*="menu"]', - '[class*="overlay"]', - '[class*="modal"]', - '[class*="account"]', - '[class*="selector"]', - 'div[style*="position: absolute"]', - 'div[style*="z-index"]' - ]; - - const candidates = []; - for (const selector of selectors) { - try { - const elements = document.querySelectorAll(selector); - elements.forEach((elem, i) => { - if (elem.offsetParent !== null && elem.offsetWidth > 0 && elem.offsetHeight > 0) { - const text = (elem.textContent || elem.innerText || '').trim(); - const hasAccountPattern = ( - text.includes('ending in') || - /…\\d{3,4}|XXX\\d{3,4}|\\.\\.\\.\\d{3,4}/.test(text) || - (/joint|ira|individual|bogle|account/i.test(text) && /\\d{3}/.test(text)) - ); - - if (text.length > 10 && hasAccountPattern) { - candidates.push({ - selector: selector, - index: i, - text: text.substring(0, 200), - score: hasAccountPattern ? 1 : 0 - }); - } - } - }); - } catch (e) { - // Skip problematic selectors - } - } - return candidates.sort((a, b) => b.score - a.score); - } - ''') - - if not dropdown_candidates: - if debug: - print("DEBUG: No dropdown candidates found") - return False - - # Use the first candidate which actually contains account text - chosen = None - for cand in dropdown_candidates: - try: - els = await page.query_selector_all(cand['selector']) - if not els or len(els) <= cand['index']: - continue - el = els[cand['index']] - txt = await el.text_content() - if txt and ('ending in' in txt or re.search(r'\d \d \d', txt) or re.search(r'XXX\d{3,4}', txt) or 'Account Selector' in txt): - chosen = cand - break - except Exception: - continue - if not chosen: - chosen = dropdown_candidates[0] - - dropdown_selector = chosen['selector'] - dropdown_index = chosen['index'] - - if debug: - print(f"DEBUG: Using dropdown selector: {dropdown_selector}, index: {dropdown_index}") - - dropdown = await page.query_selector_all(dropdown_selector) - if not dropdown or len(dropdown) <= dropdown_index: - if debug: - print("DEBUG: Dropdown element not found") - return False - - dropdown = dropdown[dropdown_index] - - # Get all account elements in the dropdown - account_elements = await dropdown.query_selector_all('button, a, [role="option"], li, div, span') - - if debug: - print(f"DEBUG: Found {len(account_elements)} account elements in dropdown") - - # ENHANCED DEBUGGING: Log all account elements and their text - if debug: - for i, elem in enumerate(account_elements): - try: - text = await elem.inner_text() - if text and len(text.strip()) >= 3: - print(f"DEBUG: Account element {i}: '{text[:100]}'") - except Exception as e: - print(f"DEBUG: Error getting text from element {i}: {e}") - - account_clicked = False - for elem in account_elements: - try: - text = await elem.inner_text() - if not text or len(text.strip()) < 3: - continue - - # Parse the account text - parsed = parse_account_text(text) - if not parsed: - continue - - # ENHANCED DEBUGGING: Only log the target account match - if (parsed['label'] == target_account['label'] or - parsed['ending'] == target_account['ending']): - - if debug: - print(f"DEBUG: Found target account: {parsed['label']}") - - # ENHANCED DEBUGGING: Take screenshot before clicking - if debug: - try: - png = await page.screenshot(full_page=True) - save_debug_artifact(f"debug_before_account_click_{account_query}_target_{target_account['label']}.png", png) - except Exception as e: - pass - - # Prefer clicking a truly clickable ancestor (button/a/role=option/menuitem) - try: - clickable = await page.evaluate_handle('''(el) => { - let e = el; - for (let i = 0; i < 6 && e; i++) { - const role = (e.getAttribute && e.getAttribute('role')) || ''; - const tag = (e.tagName || '').toUpperCase(); - if (tag === 'BUTTON' || tag === 'A' || role === 'option' || role === 'menuitem') return e; - e = e.parentElement; - } - return el; - }''', elem) - except Exception: - clickable = elem - - # If there's an anchor ancestor with href, navigate directly as a first-class strategy - if not account_clicked: - try: - # Try to find a nearest anchor and interact depending on href - anchor_handle = await page.evaluate_handle('''(el) => { - function findAnchor(node){ - let e = node; - for (let i = 0; i < 6 && e; i++) { - if (e.tagName && e.tagName.toUpperCase() === 'A' && (e.href || e.getAttribute('href'))) return e; - e = e.parentElement; - } - return null; - } - return findAnchor(el); - }''', elem) - if anchor_handle: - try: - href = await page.evaluate('(a) => a.getAttribute("href") || a.href || ""', anchor_handle) - except Exception: - href = '' - if href and isinstance(href, str) and not href.lower().startswith('javascript'): - if debug: - print(f"DEBUG: Navigating directly to account URL: {href}") - try: - await page.goto(href, timeout=30000) - await page.wait_for_selector('.sdps-page-header__account-selector, #account-selector', timeout=15000) - account_clicked = True - except Exception as e: - if debug: - print(f"DEBUG: Direct navigation failed: {e}") - else: - # Fallback: simulate a native click on the anchor to trigger SPA handler - if debug: - print("DEBUG: Clicking javascript: anchor to trigger SPA selection") - try: - await page.evaluate('(a) => { a.click(); }', anchor_handle) - # Brief wait to allow SPA to process - await page.wait_for_timeout(500) - except Exception as e: - if debug: - print(f"DEBUG: Anchor click via JS failed: {e}") - except Exception as e: - if debug: - print(f"DEBUG: Anchor search/click failed: {e}") - - # ENHANCED FIX: Try multiple click strategies for visibility issues - click_success = False - async def _click_with_nav(action_desc: str, click_fn): - nonlocal click_success - try: - # Many times selecting an account triggers a navigation/reload. - # Set up the navigation expectation BEFORE triggering the click. - try: - async with page.expect_navigation(wait_until='domcontentloaded', timeout=15000): - await click_fn() - click_success = True - if debug: - print(f"DEBUG: Click with navigation succeeded ({action_desc})") - return - except Exception as nav_err: - # No navigation captured; fallback to plain click and wait - if debug: - print(f"DEBUG: No navigation captured ({action_desc}): {nav_err}") - await click_fn() - try: - await page.wait_for_load_state('domcontentloaded', timeout=2000) - except Exception: - pass - await page.wait_for_timeout(400) - click_success = True - if debug: - print(f"DEBUG: Click without navigation succeeded ({action_desc})") - except Exception as e: - if debug: - print(f"DEBUG: Click attempt failed ({action_desc}): {e}") - - # Strategy 1: Enhanced scroll and force click - try: - # Pre-scroll to element then click with navigation capture - await page.evaluate('(element) => element.scrollIntoView({behavior: "smooth", block: "center"})', clickable) - await page.wait_for_timeout(200) - await _click_with_nav("scroll+force", lambda: clickable.click(force=True)) - except Exception as e: - if debug: - print(f"DEBUG: Enhanced scroll + force click failed: {e}") - - # Strategy 2: Multiple scroll strategies - if not click_success: - try: - # Try different scroll positions - await page.evaluate('(element) => { const rect = element.getBoundingClientRect(); window.scrollTo(rect.left, rect.top - 100); }', clickable) - await page.wait_for_timeout(200) - await _click_with_nav("pos-scroll+force", lambda: clickable.click(force=True)) - if click_success and debug: - print(f"DEBUG: Click successful with position scroll") - except Exception as e: - if debug: - print(f"DEBUG: Position scroll + click failed: {e}") - - # Strategy 3: Make element visible then click - if not click_success: - try: - await page.evaluate('(element) => { element.style.visibility = "visible"; element.style.display = "block"; element.style.opacity = "1"; element.scrollIntoView({block: "center"}); }', clickable) - await page.wait_for_timeout(200) - await _click_with_nav("make-visible+force", lambda: clickable.click(force=True)) - if click_success and debug: - print(f"DEBUG: Click successful after making visible") - except Exception as e: - if debug: - print(f"DEBUG: Make visible + click failed: {e}") - - # Strategy 4: JavaScript click with enhanced parameters - if not click_success: - try: - await _click_with_nav( - "dispatchEvent(MouseEvent)", - lambda: page.evaluate('''(element) => { - element.dispatchEvent(new MouseEvent("click", {bubbles: true, cancelable: true, view: window})); - }''', clickable) - ) - if click_success and debug: - print(f"DEBUG: Click successful with MouseEvent") - except Exception as e: - if debug: - print(f"DEBUG: MouseEvent click failed: {e}") - - # Strategy 5: Hover then multiple click attempts - if not click_success: - try: - await clickable.hover(timeout=2000) - await page.wait_for_timeout(150) - # Try multiple rapid clicks with nav capture on first - try: - await _click_with_nav("hover+rapid-1", lambda: clickable.click(force=True)) - except Exception: - pass - if not click_success: - for attempt in range(2): - try: - await clickable.click(force=True) - await page.wait_for_timeout(100) - click_success = True - break - except: - continue - if click_success and debug: - print(f"DEBUG: Click successful with hover + rapid clicks") - except Exception as e: - if debug: - print(f"DEBUG: Hover + rapid clicks failed: {e}") - - # Strategy 6: Coordinate click on element's bounding box within its scrollable container - if not click_success: - try: - # Try to scroll the nearest scrollable ancestor to reveal element - try: - await page.evaluate('''(el) => { - function findScrollable(node){ - let e = node; - for (let i=0; i<8 && e; i++){ - const style = getComputedStyle(e); - if (/(auto|scroll)/.test(style.overflowY)) return e; - e = e.parentElement; - } - return null; - } - const sc = findScrollable(el) || document.scrollingElement || document.body; - const r = el.getBoundingClientRect(); - const scRect = sc.getBoundingClientRect ? sc.getBoundingClientRect() : {top:0,left:0,height:window.innerHeight}; - const targetY = r.top + (r.height/2) - (scRect.height/2); - try { sc.scrollBy({ top: targetY, behavior: 'auto' }); } catch(_) { sc.scrollTop += targetY; } - }''', clickable) - except Exception: - pass - # Compute viewport coordinates and click - bbox = await clickable.bounding_box() - if not bbox: - # Fallback to DOM rect - rect = await page.evaluate('(el) => { const r = el.getBoundingClientRect(); return {x:r.left, y:r.top, width:r.width, height:r.height}; }', clickable) - bbox = rect - if bbox and bbox['width'] > 2 and bbox['height'] > 2: - x = bbox['x'] + bbox['width']/2 - y = bbox['y'] + bbox['height']/2 - await page.mouse.move(x, y) - await page.mouse.click(x, y) - await page.wait_for_timeout(600) - click_success = True - if debug: - print("DEBUG: Coordinate click attempted on target element") - except Exception as e: - if debug: - print(f"DEBUG: Coordinate click failed: {e}") - - if click_success: - # Mark as clicked BEFORE waiting, since navigation may occur - account_clicked = True - if debug: - print(f"DEBUG: Account option clicked; waiting for potential navigation/reload") - try: - # Try to catch a navigation if it occurs - try: - async with page.expect_navigation(timeout=5000): - pass # If a navigation was already triggered by the click, this may catch it - except Exception: - # No navigation event captured; proceed with load-state wait - pass - try: - await page.wait_for_load_state('domcontentloaded', timeout=8000) - except Exception: - pass - await page.wait_for_timeout(500) - except Exception as e: - if debug: - print(f"DEBUG: Post-click wait encountered exception: {e}") - if debug: - print(f"DEBUG: Click sequence complete for {target_account['label']}") - break - else: - if debug: - print(f"DEBUG: All click strategies failed for account: {parsed['label']}") - - except Exception as e: - if debug: - print(f"DEBUG: Error processing account element: {e}") - continue - - if not account_clicked: - if debug: - print(f"DEBUG: Could not click target via element strategies; attempting keyboard navigation") - # Attempt keyboard navigation on the account selector - try: - # Re-open selector to ensure focus is on the dropdown - await page.evaluate('''() => { - const btn = document.querySelector('.sdps-page-header__account-selector, #account-selector'); - if (btn && btn.click) btn.click(); - }''') - await page.wait_for_timeout(500) - except Exception: - pass - - # Determine current selection from header - try: - header_text = await page.evaluate('''() => { - const sel = document.querySelector('.sdps-page-header__account-selector, #account-selector'); - return sel ? (sel.textContent || '').trim() : ''; - }''') - except Exception: - header_text = '' - - current_parsed = parse_account_text(header_text) if header_text else None - current_label = current_parsed['label'] if current_parsed else None - - # Compute index positions - def _find_index(label: str) -> int: - for i, acc in enumerate(accounts): - if label == acc['label']: - return i - # fallback by ending - if label and 'XXX' in label: - ending = label.split('XXX')[-1] - for i, acc in enumerate(accounts): - if acc['ending'] == ending: - return i - return -1 - - current_index = _find_index(current_label) if current_label else -1 - target_index = _find_index(target_account['label']) - - if debug: - print(f"DEBUG: Keyboard nav indices - current: {current_index}, target: {target_index}") - - try: - # Focus the account selector button - btn = page.locator('#account-selector').first - if await btn.count() == 0: - btn = page.locator('.sdps-page-header__account-selector').first - try: - await btn.focus() - except Exception: - pass - - # Open dropdown via keyboard if needed - try: - await page.keyboard.press('Enter') - await page.wait_for_timeout(200) - except Exception: - pass - - # If indices are known, compute steps; else scan downwards up to N - max_steps = max(len(accounts) + 5, 10) - if current_index >= 0 and target_index >= 0: - steps = target_index - current_index - key = 'ArrowDown' if steps >= 0 else 'ArrowUp' - for _ in range(abs(steps)): - await page.keyboard.press(key) - await page.wait_for_timeout(120) - else: - # Blind scan - for _ in range(max_steps): - await page.keyboard.press('ArrowDown') - await page.wait_for_timeout(80) - - # Confirm selection - await page.keyboard.press('Enter') - await page.wait_for_timeout(300) - - # Verify header updated - try: - await page.wait_for_load_state('domcontentloaded', timeout=5000) - except Exception: - pass - try: - new_header_text = await page.evaluate('''() => { - const sel = document.querySelector('.sdps-page-header__account-selector, #account-selector'); - return sel ? (sel.textContent || '').trim() : ''; - }''') - except Exception: - new_header_text = '' - - if new_header_text and target_account['type'].lower() in new_header_text.lower() and target_account['ending'] in new_header_text: - if debug: - print("DEBUG: Keyboard navigation succeeded; account appears selected") - account_clicked = True - else: - if debug: - print("DEBUG: Keyboard navigation did not confirm selection") - except Exception as e: - if debug: - print(f"DEBUG: Keyboard navigation failed: {e}") - - if not account_clicked: - if debug: - print(f"DEBUG: All primary switch methods failed for {account_query}, attempting Summary page fallback...") - - try: - # Go to summary page if not already there - if "accounts/summary" not in page.url: - await page.goto("https://client.schwab.com/app/accounts/summary/#/") - await page.wait_for_timeout(5000) - - # Find the row for this account in the summary table and click its link - clicked_summary = await page.evaluate(""" - (query) => { - const rows = Array.from(document.querySelectorAll('sdps-table-row, tr')); - const targetRow = rows.find(r => r.innerText.includes(query) || r.textContent.includes(query)); - if (targetRow) { - const link = targetRow.querySelector('a.acctNavigate-button-link'); - if (link) { - link.click(); - return true; - } - } - return false; - } - """, account_query) - - if clicked_summary: - if debug: - print(f"DEBUG: Successfully clicked account {account_query} on summary page") - await page.wait_for_timeout(5000) - return True - except Exception as summary_err: - if debug: - print(f"DEBUG: Summary page fallback failed: {summary_err}") - - if debug: - print(f"DEBUG: Could not find and click/select target account: {target_account['label']}") - print(f"DEBUG: Target account details: {target_account}") - - # Close dropdown (best-effort) - try: - if not page.is_closed(): - await page.keyboard.press('Escape') - await page.wait_for_timeout(300) - await page.click('body') - await page.wait_for_timeout(500) - except Exception: - pass - - # CRITICAL: Verify the account switch actually worked using the same logic as the working test - if account_clicked: - if debug: - print("DEBUG: Verifying account switch actually worked...") - - # Wait for UI to update - try: - await page.wait_for_load_state('domcontentloaded', timeout=8000) - except Exception: - pass - await page.wait_for_timeout(500) - - # Get the current active account using the same method as the working test - try: - current_active_account = await page.evaluate(''' - () => { - const selector = document.querySelector('.sdps-page-header__account-selector'); - return selector ? selector.textContent?.trim() : null; - } - ''') - except Exception: - current_active_account = None - - if debug and current_active_account: - print(f"DEBUG: Current active account after switch: {current_active_account[:100]}...") - - # Use the SAME verification logic as the working test script - # Check if the account text actually contains our target - account_switch_verified = ( - target_account['type'].lower() in current_active_account.lower() and - target_account['ending'] in current_active_account - ) - - if account_switch_verified: - if debug: - print("✅ SUCCESS: Account switch VERIFIED - target account is now active") - print(f"✅ Found {target_account['type']} and {target_account['ending']} in account text") - return True - else: - if debug: - print("❌ FAILURE: Account switch failed verification - target account not active") - print(f"❌ Expected: {target_account['type']} and {target_account['ending']}") - print(f"❌ Got: {current_active_account[:100]}...") - return False - else: - if debug: - print("❌ FAILURE: Could not verify account switch - no active account found") - return False - else: - if debug: - print("❌ FAILURE: Could not click target account") - return False - - except Exception as e: - if debug: - print(f"DEBUG: Exception in switch_account_on_page: {e}") - import traceback - print(f"DEBUG: Full traceback: {traceback.format_exc()}") - return False - - -async def perform_export_download_enhanced(page, time_period: Optional[str] = None, account: Optional[str] = None, debug: bool = False, context=None, preserve_filename: bool = True) -> Dict[str, Any]: - """Enhanced export function with reliable account switching and filename preservation. - - Args: - page: Playwright page object - time_period: Time period for export (e.g., "Current Month") - account: Account identifier to switch to before export - debug: Enable debug output - context: Browser context for page recovery - preserve_filename: If True, save with original Schwab filename - - Returns: - Dict containing export results and metadata - """ - if debug: - print("DEBUG: Starting enhanced export download...") - - try: - # Ensure we're on the history page - await goto_history(page, context=context, debug=debug) - - # Switch to target account if specified - if account: - if debug: - print(f"DEBUG: Attempting to switch to account: {account}") - - success = await switch_account_with_verification(page, account, debug=debug) - if not success: - error_msg = f"Failed to switch to account '{account}'. Please manually select the correct account and retry." - if debug: - print(f"DEBUG: {error_msg}") - return { - "error": error_msg, - "account_requested": account, - "success": False - } - - # Open export panel - if debug: - print("DEBUG: Opening export panel...") - await open_export_panel(page, debug=debug) - - # Wait for export dialog - await page.wait_for_timeout(2000) - - # Find export dialog - export_dialog = None - dialogs = await page.query_selector_all("div[role='dialog']") - - for i, dialog in enumerate(dialogs): - try: - dialog_id = await dialog.get_attribute('aria-labelledby') - dialog_body_id = await dialog.get_attribute('aria-describedby') - - if (dialog_id and 'export-transactions' in dialog_id) or \ - (dialog_body_id and 'export-transactions' in dialog_body_id): - export_dialog = dialog - if debug: - print(f"DEBUG: Found export transactions dialog by ID") - break - except: - pass - - # Also check dialog content - try: - dialog_text = await dialog.inner_text() - if any(keyword in dialog_text.lower() for keyword in ['export transactions', 'csv', 'download']): - export_dialog = dialog - if debug: - print(f"DEBUG: Found export dialog by content") - break - except: - pass - - if not export_dialog: - return { - "error": "Could not find export dialog", - "success": False - } - - # Configure export settings - if time_period: - try: - await select_time_period(page, time_period, container=export_dialog, debug=debug) - except Exception as e: - if debug: - print(f"DEBUG: Time period selection failed: {e}") - - try: - await ensure_csv_format(page, container=export_dialog, debug=debug) - except Exception as e: - if debug: - print(f"DEBUG: CSV format selection failed: {e}") - - # Find and click export button - export_selectors = [ - "button:has-text('Export')", - "button[aria-label*='export']", - "button[aria-label*='Export']", - "input[type='submit'][value*='Export']", - "button:has-text('Download')", - ".export-button", - "[data-testid*='export']" - ] - - export_btn = None - for selector in export_selectors: - try: - btn = await export_dialog.query_selector(selector) - if btn and await btn.is_visible(): - export_btn = btn - if debug: - print(f"DEBUG: Found export button with selector: {selector}") - break - except Exception: - continue - - if not export_btn: - return { - "error": "Could not find export button in dialog", - "success": False - } - - # Set up download handler and click export - download_promise = page.wait_for_event('download') - - try: - await export_btn.click(force=True) - if debug: - print("DEBUG: Export button clicked (force)") - except Exception as e1: - if debug: - print(f"DEBUG: Force click failed: {e1}") - try: - await export_btn.evaluate("element => element.click()") - if debug: - print("DEBUG: Export button clicked (JS)") - except Exception as e2: - if debug: - print(f"DEBUG: JS click also failed: {e2}") - return { - "error": "Failed to click export button", - "success": False - } - - # Wait for download - try: - download = await asyncio.wait_for(download_promise, timeout=30) - - # Save the download - suggested_filename = download.suggested_filename - if preserve_filename: - download_path = f"./{suggested_filename}" - else: - # Use timestamp-based filename - from datetime import datetime - download_path = f"./export_{datetime.now().strftime('%Y%m%d_%H%M%S')}.csv" - - await download.save_as(download_path) - - # Get file info - import os - file_size = os.path.getsize(download_path) - - # Parse filename info - filename_info = parse_suggested_filename(suggested_filename) if suggested_filename else {} - - result = { - "success": True, - "filename": suggested_filename, - "saved_path": download_path, - "file_size": file_size, - "account_info": filename_info, - "time_period": time_period, - "account_requested": account - } - - if debug: - print(f"DEBUG: ✅ Export successful!") - print(f"DEBUG: Filename: {suggested_filename}") - print(f"DEBUG: Saved to: {download_path}") - print(f"DEBUG: File size: {file_size:,} bytes") - - return result - - except asyncio.TimeoutError: - return { - "error": "Download timeout - export may have failed", - "success": False - } - except Exception as e: - return { - "error": f"Download failed: {str(e)}", - "success": False - } - - except Exception as e: - if debug: - print(f"DEBUG: Exception in perform_export_download_enhanced: {e}") - return { - "error": f"Export failed: {str(e)}", - "success": False - } - - -async def perform_export_download(page, time_period: Optional[str] = None, account: Optional[str] = None, debug: bool = False, context=None) -> Dict[str, Any]: - if debug: - print("DEBUG: Navigating to history page…") - # If the page was closed due to prior actions, reopen it - try: - if page.is_closed(): - if context is None: - raise Exception("Playwright page is closed and no context provided to recover") - from ...browser.client import new_page - page = await new_page(context) - except Exception: - pass - await goto_history(page, context=context, debug=debug) - - # Check current account but DO NOT attempt switching to avoid context closure - current_account_info = None - if account and debug: - try: - current_account = await page.evaluate(''' - () => { - const header = document.querySelector('.sdps-page-header__account-selector, #account-selector'); - return header ? (header.textContent || '').trim() : ''; - } - ''') - print(f"DEBUG: Current account text: {current_account[:200]}") - - # Parse current account info without switching - target_ending = account[-3:] if len(account) >= 3 else account - account_type = None - if "_XXX" in account: - account_type = account.split("_XXX")[0].replace("_", " ") - - # Check if selected account matches target by parsing the "Selected" portion - account_matched = False - if "Selected" in current_account: - selected_portion = current_account.split("Selected")[0] - if debug: - print(f"DEBUG: Currently selected portion: '{selected_portion}'") - - # More robust matching logic - if account_type and target_ending: - type_match = account_type.lower() in selected_portion.lower() - ending_match = (target_ending in selected_portion or - f"ending in {' '.join(target_ending)}" in selected_portion.lower()) - account_matched = type_match and ending_match - elif target_ending: - account_matched = (target_ending in selected_portion or - f"ending in {' '.join(target_ending)}" in selected_portion.lower()) - else: - # Fallback to substring match for account type only - account_matched = account.lower() in selected_portion.lower() - - current_account_info = { - 'text': current_account, - 'matched': account_matched, - 'target_type': account_type, - 'target_ending': target_ending - } - - if account_matched: - if debug: - print(f"DEBUG: ✅ Current account matches target {account}") - else: - if debug: - print(f"DEBUG: ⚠️ Current account does NOT match target {account}") - print(f"DEBUG: Target type: '{account_type}', ending: '{target_ending}'") - print(f"DEBUG: IMPORTANT: Account switching via UI causes browser crashes.") - print(f"DEBUG: The export will proceed and verify by filename. If wrong account,") - print(f"DEBUG: user will get clear instructions to manually select the correct account.") - - except Exception as e: - if debug: - print(f"DEBUG: Could not check current account: {e}") - print(f"DEBUG: Will proceed with export and verify by filename") - - # Perform export with verification and retry if filename doesn't match target account - max_export_attempts = 3 - last_meta = None - - for export_attempt in range(max_export_attempts): - if debug: - print(f"DEBUG: Export attempt {export_attempt + 1}/{max_export_attempts}…") - - # Ensure page still alive before continuing - page_closed = False - try: - page_closed = page.is_closed() - except Exception: - page_closed = True - if page_closed: - if context is None: - raise Exception("Playwright page is closed and no context provided to recover") - from ...browser.client import new_page - page = await new_page(context) - await goto_history(page, context=context, debug=debug) - # NOTE: We don't re-attempt account switching here anymore - # Account switching is done BEFORE the export loop to avoid page closure issues - - await open_export_panel(page, debug=debug) - # Scope to the export dialog for subsequent interactions - if debug: - print("DEBUG: Resolving export dialog…") - dialog = await _resolve_export_dialog(page, debug=debug) - if debug: - try: - png = await page.screenshot(full_page=True) - save_debug_artifact("debug_export_dialog_open.png", png) - except Exception: - pass - - # Ensure any dialog-level account selector also targets requested account - await _ensure_account_in_export_dialog(page, dialog, account, debug=debug) - await select_time_period(page, time_period, container=dialog, debug=debug) - await ensure_csv_format(page, container=dialog, debug=debug) - - # Re-verify account before download (header or dialog) - if account: - if debug: - print("DEBUG: Final account verification before download…") - pre_download_account = await page.evaluate(''' - () => { - const header = document.querySelector('.sdps-page-header__account-selector, #account-selector'); - const headerText = header ? (header.textContent || '').trim() : ''; - return headerText; - } - ''') - if debug: - print(f"DEBUG: Header account before download: {pre_download_account}") - # Try dialog scope too - try: - dialog_text = await dialog.evaluate('(root) => (root.textContent || "").trim().substring(0, 300)') - except Exception: - dialog_text = None - if debug and dialog_text: - print(f"DEBUG: Dialog account preview before download: {dialog_text[:120]}…") - - # Trigger download via the Export button inside the dialog - try: - async with page.expect_download(timeout=60000) as download_info: - await dialog.locator("button:has-text('Export')").first.click() - download = await download_info.value - except Exception: - # Fallback: try clicking any visible Export inside dialog with force - async with page.expect_download(timeout=60000) as download_info: - await dialog.locator("button:has-text('Export')").first.click(force=True) - download = await download_info.value - - suggested = download.suggested_filename - meta = parse_suggested_filename(suggested) - last_meta = meta - - if debug: - print("DEBUG: Download verification:") - print(f"DEBUG: Requested account: {account}") - print(f"DEBUG: Downloaded filename: {suggested}") - print(f"DEBUG: Parsed account from filename: {meta.get('label', 'Unknown')}") - - # Verify the downloaded filename corresponds to the requested account - if not account or _label_matches_account_query(account, meta.get('label', '')): - # Accept this download - temp_path = f"/tmp/{suggested}" - await download.save_as(temp_path) - with open(temp_path, 'rb') as f: - csv_content = f.read() - if debug: - print(f"DEBUG: Download complete: {suggested} -> {temp_path}") - return {"content": csv_content, "filename": suggested, "path": temp_path, **meta} - - # Mismatch: close dialog, re-verify account, and retry - if debug: - print("⚠️ WARNING: Downloaded filename doesn't match requested account; retrying export") - try: - await page.keyboard.press('Escape') - await page.wait_for_timeout(300) - await page.click('body') - except Exception: - pass - - # NOTE: We no longer attempt to switch accounts here as it causes page closure - # Account switching is done once BEFORE the export loop - # Just give UI time to settle before retry - await page.wait_for_timeout(1500) - - # As a last resort before next attempt, reload the history page - # The account selection should be preserved in the session - try: - await goto_history(page, context=context, debug=debug) - except Exception: - pass - - # If we reach here, all export attempts produced mismatched account files - current_label = (last_meta or {}).get('label', 'Unknown') - - # Enhanced error message with clear resolution steps - error_msg = f"""🚨 ACCOUNT MISMATCH: Wrong account transactions exported - -REQUESTED: {account} -EXPORTED: {current_label} - -🔧 SOLUTION - Manual Account Selection Required: - -Due to Schwab's website design, automatic account switching causes browser crashes. -Please follow these steps: - -1. 🌐 Open Schwab website manually: https://client.schwab.com -2. 📋 Navigate to: Accounts → History → Transactions -3. 🎯 Click the account selector dropdown (top of page) -4. ✅ Select the account: {account} -5. 🔄 Re-run the scraper (it will use the manually selected account) - -💡 WHY THIS HAPPENS: - - Schwab's account switching triggers complete page reloads - - This closes the browser automation session - - Manual selection before running scraper works perfectly - -📖 ALTERNATIVE: Use the account that's currently selected ({current_label})""" - - raise Exception(error_msg) diff --git a/schwab_scraper/features/transactions/service.py b/schwab_scraper/features/transactions/service.py deleted file mode 100644 index 2050386..0000000 --- a/schwab_scraper/features/transactions/service.py +++ /dev/null @@ -1,833 +0,0 @@ -from __future__ import annotations - -import json -from typing import Optional, Dict, Any, List -from datetime import datetime, timezone -import re - -from ...browser.auth import ensure_cookies -from ...core.config import load_config, get_playwright_url -from ...browser.client import connect, new_context, new_page -from ...browser.navigation import goto_with_auth_check -from .scraper import ( - perform_export_download, - perform_export_download_enhanced, - discover_accounts_from_page, - discover_accounts_with_numbers, -) -from .parser import parse_csv_content -from ...storage.cache import ( - write_cached_transaction_csv, - read_cached_transaction_csv, - TRANSACTION_CACHE_DIR, -) -from ...core.models import AccountInfo, TransactionData -from ...core import Envelope, ErrorType, fail, ok -import os - - -async def _get_transaction_history_enhanced_impl( - account: Optional[str] = None, - start_date: Optional[str] = None, - end_date: Optional[str] = None, - time_period: Optional[str] = None, - debug: bool = False, -) -> Envelope[TransactionData]: - """ - Enhanced export with reliable account switching and filename preservation. - - Args: - account: Account identifier (ending digits like '674', type like 'PLA Assets', or full label like 'PLA_Assets_XXX674'). - ✅ ENHANCED: Now supports reliable automatic account switching with verification. - start_date, end_date: Reserved for future "Custom" range support. - time_period: One of pre-defined periods (e.g., "Current Month", "Last 6 Months"). If None, uses page default. - debug: Enable debug logging and screenshots. - - Returns: - Dict with transaction data, account info, and export metadata. - """ - print("Starting enhanced transaction export...") - if debug: - print(f" Account: {account}") - print(f" Time period: {time_period}") - - # Load configuration and cookies - config = load_config() - playwright_url = get_playwright_url(config) - cookies = await ensure_cookies() - - if not cookies: - return fail( - "Could not establish session. Check credentials or manually refresh cookies.json.", - ErrorType.AUTHENTICATION, - retryable=False, - ) - - # Connect to browser - p, browser = await connect(playwright_url) - context = None - page = None - - try: - context = await new_context(browser, cookies=cookies) - page = await new_page(context) - - # Use the enhanced export function - export_result = await perform_export_download_enhanced( - page=page, - time_period=time_period, - account=account, - debug=debug, - context=context, - preserve_filename=True - ) - - if not export_result.get("success"): - # Try fallback to cached data - if account: - if debug: - print("Enhanced export failed, trying cached fallback...") - - # Determine account label for cache lookup - account_label = account - if account.isdigit(): - # Try to discover accounts to find full label - try: - accounts = await discover_accounts_with_numbers(page, debug=debug) - for acc in accounts: - if acc['ending'] == account[-3:]: - account_label = acc['label'] - break - except Exception: - pass - - cached_bytes = read_cached_transaction_csv(account_label) - if cached_bytes: - if debug: - print(f"Using cached data for {account_label}") - - # Parse the cached CSV bytes - records = parse_csv_content(cached_bytes) - - # Build account info from the label - account_type = account_label.split('_')[0] if '_' in account_label else "Unknown" - account_ending = account_label[-3:] if account_label[-3:].isdigit() else "000" - - data = TransactionData( - account_info=AccountInfo( - account_type=account_type, - account_ending=account_ending, - full_description=account_label, - is_selected=True, - ), - transactions=records, - date_range=time_period or "Unknown", - export_date="Unknown", - total_transactions=len(records), - source="cache", - ) - return ok(data) - - return fail( - export_result.get("error", "Enhanced export failed."), - ErrorType.UNKNOWN, - retryable=True, - ) - - # Parse the exported CSV - saved_path = export_result.get("saved_path") - if not saved_path or not os.path.exists(saved_path): - return fail("Export file not found after download", ErrorType.PARSING, retryable=True) - - with open(saved_path, 'r', encoding='utf-8') as f: - csv_content = f.read() - - parsed_data = parse_csv_content(csv_content.encode('utf-8')) - if not parsed_data: - return fail("Failed to parse CSV: No transactions found", ErrorType.PARSING, retryable=True) - - # Build response - account_info = export_result.get("account_info", {}) - transactions = parsed_data - - # Cache the results - if account_info.get("account_ending"): - account_label = f"{account_info.get('account_type', 'Unknown')}_XXX{account_info.get('account_ending')}" - try: - # Generate timestamp for filename - timestamp = datetime.now(timezone.utc).strftime('%Y%m%d-%H%M%S') - - # Convert transactions back to CSV format for caching - import csv - import io - - # Create CSV content from transactions - output = io.StringIO() - writer = csv.writer(output) - - # Write header - writer.writerow(["Date", "Action", "Symbol", "Description", "Quantity", "Price", "Fees & Comm", "Amount"]) - - # Write transaction data - for transaction in transactions: - writer.writerow([ - transaction.date, - transaction.action, - transaction.symbol or "", - transaction.description, - transaction.quantity or "", - transaction.price or "", - transaction.fees_comm or "", - transaction.amount or "" - ]) - - csv_bytes = output.getvalue().encode('utf-8') - write_cached_transaction_csv(account_label, timestamp, csv_bytes) - - if debug: - print(f"Cached transaction data for {account_label}") - except Exception as e: - if debug: - print(f"Failed to cache data: {e}") - - data = TransactionData( - account_info=AccountInfo( - account_type=account_info.get("account_type", "Unknown"), - account_ending=account_info.get("account_ending", "000"), - full_description=account_info.get("full_description", ""), - is_selected=account_info.get("is_selected", True), - ), - transactions=transactions, - date_range=time_period or "Unknown", - export_date=datetime.now(timezone.utc).strftime('%Y-%m-%d %H:%M:%S UTC'), - total_transactions=len(transactions), - source="live", - ) - - if debug: - print(f"✅ Enhanced export successful: {len(transactions)} transactions") - - return ok(data) - - except Exception as e: - if debug: - print(f"Enhanced export exception: {e}") - import traceback - traceback.print_exc() - return fail(f"Enhanced export failed: {str(e)}", ErrorType.UNKNOWN, retryable=True) - - finally: - if page: - await page.close() - if context: - await context.close() - if browser: - await browser.close() - - -async def _ensure_cookies() -> Optional[List[Dict[str, Any]]]: - # Delegate to shared helper - return await ensure_cookies() - - -def _get_latest_cache_csv_filename(account_label: str) -> Optional[str]: - """Return the most recent CSV filename under the account's cache directory, if any.""" - import os - dir_path = os.path.join(TRANSACTION_CACHE_DIR, account_label) - if not os.path.isdir(dir_path): - return None - csv_files = [f for f in os.listdir(dir_path) if f.lower().endswith('.csv')] - if not csv_files: - return None - # Sort by mtime if possible; fall back to lexical - try: - csv_files.sort(key=lambda f: os.path.getmtime(os.path.join(dir_path, f))) - except Exception: - csv_files.sort() - return csv_files[-1] - - -def _is_cache_fresh_for_label(account_label: str, max_age_hours: int = 24) -> bool: - """Return True if the most recent CSV for `account_label` is within `max_age_hours`.""" - import os, time - dir_path = os.path.join(TRANSACTION_CACHE_DIR, account_label) - if not os.path.isdir(dir_path): - return False - csv_files = [f for f in os.listdir(dir_path) if f.lower().endswith('.csv')] - if not csv_files: - return False - # Use mtime (file creation/update time) to assess freshness - newest_path = max((os.path.join(dir_path, f) for f in csv_files), key=lambda p: os.path.getmtime(p)) - age_seconds = time.time() - os.path.getmtime(newest_path) - return age_seconds <= max_age_hours * 3600 - - -def _match_account_label_from_cache(account_query: Optional[str]) -> Optional[str]: - """Resolve a matching account label from cache directories given a query like '604' or 'PLA_Assets_XXX674'. - Only returns a label if a fresh (<=24h) CSV exists for that label. - """ - import os - if not os.path.isdir(TRANSACTION_CACHE_DIR): - return None - labels = [name for name in os.listdir(TRANSACTION_CACHE_DIR) - if os.path.isdir(os.path.join(TRANSACTION_CACHE_DIR, name))] - if not labels: - return None - - def label_matches(label: str, query: str) -> bool: - if not query: - return True - if query == label: - return True - # match by ending digits - if query.isdigit() and label.endswith(query): - return True - # substring match (e.g., 'PLA_Assets') - if query.lower() in label.lower(): - return True - return False - - # If no query provided: return latest fresh label if any - if not account_query: - fresh_labels = [lbl for lbl in labels if _is_cache_fresh_for_label(lbl)] - if not fresh_labels: - return None - fresh_labels.sort(key=lambda n: os.path.getmtime(os.path.join(TRANSACTION_CACHE_DIR, n)), reverse=True) - return fresh_labels[0] - - # Query provided: only return a matching fresh label - for lbl in labels: - if label_matches(lbl, account_query) and _is_cache_fresh_for_label(lbl): - return lbl - - # No fresh matching label - return None - - -async def _get_transaction_history_impl( - account: Optional[str] = None, - start_date: Optional[str] = None, - end_date: Optional[str] = None, - time_period: Optional[str] = None, - debug: bool = False, -) -> Envelope[TransactionData]: - """ - Export and parse transaction history for the selected account. - - Args: - account: Account identifier (ending digits like '604', name like 'Joint', or full label like 'PLA_Assets_XXX674'). - ⚠️ IMPORTANT: Due to Schwab's website design, automatic account switching causes browser crashes. - If the wrong account is selected, you'll get clear instructions to manually select the correct account first. - start_date, end_date: Reserved for future "Custom" range support. - time_period: One of pre-defined periods (e.g., "Current Month", "Last 6 Months"). If None, uses page default. - """ - # Basic input validation for optional custom date params - def _parse_date(date_str: str) -> Optional[datetime]: - # Accept YYYY-MM-DD or MM/DD/YYYY - if re.fullmatch(r"\d{4}-\d{2}-\d{2}", date_str): - try: - return datetime.strptime(date_str, "%Y-%m-%d") - except ValueError: - return None - if re.fullmatch(r"\d{2}/\d{2}/\d{4}", date_str): - try: - return datetime.strptime(date_str, "%m/%d/%Y") - except ValueError: - return None - return None - - if start_date: - start_dt = _parse_date(start_date) - if not start_dt: - return fail(f"Invalid start_date format: '{start_date}'. Use YYYY-MM-DD or MM/DD/YYYY.", ErrorType.VALIDATION, retryable=False) - else: - start_dt = None - - if end_date: - end_dt = _parse_date(end_date) - if not end_dt: - return fail(f"Invalid end_date format: '{end_date}'. Use YYYY-MM-DD or MM/DD/YYYY.", ErrorType.VALIDATION, retryable=False) - else: - end_dt = None - - if start_dt and end_dt and start_dt > end_dt: - return fail( - "start_date must be on or before end_date", - ErrorType.VALIDATION, - retryable=False, - ) - - cookies = await _ensure_cookies() - if not cookies: - account_label = _match_account_label_from_cache(account) - if account_label: - cached_bytes = read_cached_transaction_csv(account_label) - if cached_bytes: - records = parse_csv_content(cached_bytes) - export_date = datetime.now(timezone.utc).strftime('%Y-%m-%d %H:%M:%S UTC') - account_info = AccountInfo( - account_type=account_label.split('_')[0], - account_ending=account_label[-3:], - full_description=account_label, - is_selected=True, - ) - data = TransactionData( - account_info=account_info, - transactions=records, - date_range=time_period or "Cache", - export_date=export_date, - total_transactions=len(records), - source="cache", - ) - return ok(data) - return fail( - "Unable to establish a session. Provide credentials in config.json or a valid cookies.json.", - ErrorType.AUTHENTICATION, - retryable=False, - ) - - config = load_config() - playwright_url = get_playwright_url(config) - - p, browser = await connect(playwright_url) - context = None - page = None - try: - context = await new_context(browser, cookies=cookies) - page = await new_page(context) - - try: - download = await perform_export_download( - page, - time_period=time_period, - account=account, - debug=debug, - context=context, - ) - csv_bytes = download["content"] - account_label = download["label"] - ts = download["ts"] - - # Cache - write_cached_transaction_csv(account_label, ts, csv_bytes) - - # Parse - records = parse_csv_content(csv_bytes) - - # Build metadata - export_date = datetime.now(timezone.utc).strftime('%Y-%m-%d %H:%M:%S UTC') - account_info = AccountInfo( - account_type=account_label.split('_')[0], - account_ending=account_label[-3:], - full_description=account_label, - is_selected=True, - ) - data = TransactionData( - account_info=account_info, - transactions=records, - date_range=time_period or "Page Default", - export_date=export_date, - total_transactions=len(records), - source="live", - ) - return ok(data) - except Exception as e: - # First failure: attempt one reconnect and retry, then fallback to cache - if debug: - try: - print(f"DEBUG: perform_export_download failed: {type(e).__name__}: {e}") - except Exception: - pass - # Attempt one reconnect if browser/context appears closed - try: - # Cleanup previous if possible - try: - if context is not None: - await context.close() - except Exception: - pass - try: - await browser.close() - except Exception: - pass - try: - await p.stop() - except Exception: - pass - - # Reconnect - p, browser = await connect(playwright_url) - context = await new_context(browser, cookies=cookies) - page = await new_page(context) - # Retry export - if debug: - print("DEBUG: Retrying perform_export_download after reconnect...") - download = await perform_export_download( - page, - time_period=time_period, - account=account, - debug=debug, - context=context, - ) - csv_bytes = download["content"] - account_label = download["label"] - ts = download["ts"] - - # Cache - write_cached_transaction_csv(account_label, ts, csv_bytes) - - # Parse - records = parse_csv_content(csv_bytes) - - export_date = datetime.now(timezone.utc).strftime('%Y-%m-%d %H:%M:%S UTC') - account_info = AccountInfo( - account_type=account_label.split('_')[0], - account_ending=account_label[-3:], - full_description=account_label, - is_selected=True, - ) - data = TransactionData( - account_info=account_info, - transactions=records, - date_range=time_period or "Page Default", - export_date=export_date, - total_transactions=len(records), - source="live", - ) - return ok(data) - except Exception as e2: - if debug: - try: - print(f"DEBUG: Retry after reconnect failed: {type(e2).__name__}: {e2}") - except Exception: - pass - # Fall back to cache if available and fresh - account_label = _match_account_label_from_cache(account) - if account_label: - cached_bytes = read_cached_transaction_csv(account_label) - if cached_bytes: - records = parse_csv_content(cached_bytes) - export_date = datetime.now(timezone.utc).strftime('%Y-%m-%d %H:%M:%S UTC') - account_info = AccountInfo( - account_type=account_label.split('_')[0], - account_ending=account_label[-3:], - full_description=account_label, - is_selected=True, - ) - data = TransactionData( - account_info=account_info, - transactions=records, - date_range=time_period or "Cache", - export_date=export_date, - total_transactions=len(records), - source="cache", - ) - return ok(data) - return fail("Export failed and no fresh cache available", ErrorType.UNKNOWN, retryable=True) - - except Exception as e: - return fail(str(e), ErrorType.UNKNOWN, retryable=True) - - finally: - try: - if context is not None: - await context.close() - except Exception: - pass - try: - await browser.close() - except Exception: - pass - try: - await p.stop() - except Exception: - pass - - -def _get_cache_accounts(debug: bool = False) -> List[Dict[str, Any]]: - """Get accounts from cache directory fallback with enhanced validation.""" - from ...storage.cache import TRANSACTION_CACHE_DIR - import os - from datetime import datetime - - if not os.path.isdir(TRANSACTION_CACHE_DIR): - if debug: - print(f"DEBUG: Cache directory does not exist: {TRANSACTION_CACHE_DIR}") - return [] - - out = [] - cache_dirs = [] - - # Collect all cache directories with metadata - for name in os.listdir(TRANSACTION_CACHE_DIR): - path = os.path.join(TRANSACTION_CACHE_DIR, name) - if os.path.isdir(path): - try: - # Get directory modification time and file count - stat = os.stat(path) - csv_files = [f for f in os.listdir(path) if f.endswith('.csv')] - cache_dirs.append({ - 'name': name, - 'path': path, - 'mtime': stat.st_mtime, - 'csv_count': len(csv_files), - 'csv_files': csv_files - }) - except Exception as e: - if debug: - print(f"DEBUG: Error processing cache dir {name}: {e}") - continue - - # Sort by modification time (most recent first) to prioritize active accounts - cache_dirs.sort(key=lambda x: x['mtime'], reverse=True) - - if debug: - print(f"DEBUG: Found {len(cache_dirs)} cache directories") - - for cache_info in cache_dirs: - name = cache_info['name'] - csv_files = cache_info['csv_files'] - - if not csv_files: - if debug: - print(f"DEBUG: Skipping {name} - no CSV files") - continue - - try: - # Normalize using filename parser to ensure consistent label - normalized_label = name - account_type = None - account_ending = None - - # Strategy 1: Use directory name if it matches expected pattern - if re.match(r"^[A-Za-z_]+_XXX\d{3,4}$", name): - normalized_label = name - parts = name.split('_XXX') - account_type = parts[0].replace('_', ' ') - account_ending = parts[1] if len(parts) > 1 else name[-3:] - else: - # Strategy 2: Parse from most recent CSV filename - try: - from .scraper import parse_suggested_filename - latest_csv = sorted(csv_files)[-1] # Get most recent file - parsed_filename = parse_suggested_filename(latest_csv) - normalized_label = parsed_filename["label"] - - # Extract type and ending from parsed label - if '_XXX' in normalized_label: - parts = normalized_label.split('_XXX') - account_type = parts[0].replace('_', ' ') - account_ending = parts[1] if len(parts) > 1 else normalized_label[-3:] - except Exception as e: - if debug: - print(f"DEBUG: Failed to parse filename for {name}: {e}") - # Strategy 3: Fallback to directory name parsing - normalized_label = name - account_type = name - account_ending = name[-3:] if name[-3:].isdigit() else "000" - - # Validate the parsed data - if not account_ending or not account_ending.isdigit() or len(account_ending) < 3: - if debug: - print(f"DEBUG: Invalid account ending for {name}: {account_ending}") - continue - - # Create account entry - account_entry = { - "label": normalized_label, - "type": account_type or normalized_label.split('_')[0], - "ending": account_ending[-3:], # Ensure 3 digits - "cache_info": { - "last_updated": datetime.fromtimestamp(cache_info['mtime']).isoformat(), - "csv_count": cache_info['csv_count'] - } - } - - out.append(account_entry) - - if debug: - print(f"DEBUG: Added cache account: {normalized_label} ({account_type} ending {account_ending[-3:]}) - {cache_info['csv_count']} files") - - except Exception as e: - if debug: - print(f"DEBUG: Error processing cache account {name}: {e}") - continue - - if debug: - print(f"DEBUG: Successfully processed {len(out)} accounts from cache") - if not out: - print(f"DEBUG: Cache directory contents: {os.listdir(TRANSACTION_CACHE_DIR) if os.path.isdir(TRANSACTION_CACHE_DIR) else 'N/A'}") - - return out - - -async def _list_available_accounts_impl(debug: bool = False) -> List[Dict[str, Any]]: - """Return list of available accounts from live page when possible; fall back to cache with enhanced reliability.""" - if debug: - print("DEBUG: Starting account listing with enhanced discovery...") - - # Try live discovery with enhanced error handling - cookies = await _ensure_cookies() - if cookies: - if debug: - print("DEBUG: Session cookies available, attempting live account discovery...") - - config = load_config() - playwright_url = get_playwright_url(config) - p, browser = await connect(playwright_url) - context = None - page = None - try: - context = await new_context(browser, cookies=cookies) - page = await new_page(context) - - # Use centralized auth-aware navigation with retry - max_auth_attempts = 2 - auth_success = False - - for auth_attempt in range(max_auth_attempts): - if debug: - print(f"DEBUG: Authentication attempt {auth_attempt + 1}/{max_auth_attempts}...") - - auth_success = await goto_with_auth_check(page, context, "https://client.schwab.com/app/accounts/history/#/", debug=debug) - if auth_success: - break - elif auth_attempt < max_auth_attempts - 1: - if debug: - print("DEBUG: Authentication failed, retrying...") - await page.wait_for_timeout(3000) - - if not auth_success: - if debug: - print("DEBUG: All authentication attempts failed") - raise Exception("Authentication failed after multiple attempts") - - if debug: - print("DEBUG: Successfully authenticated, discovering accounts from live dropdown...") - - # Enhanced account discovery with fallback strategies - accounts = [] - - try: - accounts = await discover_accounts_from_page(page, debug=debug) - if debug: - print(f"DEBUG: Live account discovery returned {len(accounts)} accounts") - except Exception as e: - if debug: - print(f"DEBUG: Live account discovery failed: {e}") - accounts = [] - - # Enhanced result processing - if accounts: - if debug: - print(f"DEBUG: Successfully discovered {len(accounts)} accounts from live page:") - for acc in accounts: - print(f"DEBUG: - {acc['label']} ({acc['type']} ending {acc['ending']})") - - # Always try to enrich with cache data for completeness - cache_accounts = _get_cache_accounts(debug=debug) - if cache_accounts: - if debug: - print(f"DEBUG: Found {len(cache_accounts)} accounts in cache, merging...") - - # Merge live and cache, preferring live data but keeping unique cache entries - combined = {acc['ending']: acc for acc in cache_accounts} - live_endings = set() - - for live_acc in accounts: - combined[live_acc['ending']] = live_acc # Live data takes precedence - live_endings.add(live_acc['ending']) - - result = list(combined.values()) - if debug: - print(f"DEBUG: Final merged result: {len(result)} accounts") - for acc in result: - source = "live" if acc['ending'] in live_endings else "cache" - print(f"DEBUG: - {acc['label']} ({acc['type']} ending {acc['ending']}) [{source}]") - - return result - else: - if debug: - print("DEBUG: No cache data available, returning live accounts only") - return accounts - else: - if debug: - print("DEBUG: No accounts discovered from live page, falling back to cache only") - - except Exception as e: - if debug: - print(f"DEBUG: Live account discovery failed with error: {e}") - # Continue to cache fallback - - finally: - # Enhanced cleanup - cleanup_tasks = [] - if context is not None: - cleanup_tasks.append(context.close()) - if browser is not None: - cleanup_tasks.append(browser.close()) - if p is not None: - cleanup_tasks.append(p.stop()) - - for task in cleanup_tasks: - try: - await task - except Exception: - pass - else: - if debug: - print("DEBUG: No session cookies available, skipping live discovery") - - # Enhanced cache fallback - if debug: - print("DEBUG: Using cache-only fallback for account listing...") - - cache_accounts = _get_cache_accounts(debug=debug) - if cache_accounts: - if debug: - print(f"DEBUG: Successfully retrieved {len(cache_accounts)} accounts from cache") - return cache_accounts - else: - if debug: - print("DEBUG: No accounts found in cache either") - return [] - - -async def list_available_accounts(debug: bool = False) -> Envelope[List[Dict[str, Any]]]: - try: - accounts = await _list_available_accounts_impl(debug=debug) - return ok(accounts) - except Exception as exc: - return fail(str(exc), ErrorType.UNKNOWN, retryable=True) - - -async def get_transaction_history( - account: Optional[str] = None, - start_date: Optional[str] = None, - end_date: Optional[str] = None, - time_period: Optional[str] = None, - debug: bool = False, -) -> Envelope[TransactionData]: - return await _get_transaction_history_impl( - account=account, - start_date=start_date, - end_date=end_date, - time_period=time_period, - debug=debug, - ) - - -async def get_transaction_history_enhanced( - account: Optional[str] = None, - start_date: Optional[str] = None, - end_date: Optional[str] = None, - time_period: Optional[str] = None, - debug: bool = False, -) -> Envelope[TransactionData]: - return await _get_transaction_history_enhanced_impl( - account=account, - start_date=start_date, - end_date=end_date, - time_period=time_period, - debug=debug, - ) diff --git a/schwab_scraper/server/__init__.py b/schwab_scraper/server/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/schwab_scraper/server/api.py b/schwab_scraper/server/api.py deleted file mode 100644 index 79ef06d..0000000 --- a/schwab_scraper/server/api.py +++ /dev/null @@ -1,74 +0,0 @@ -from fastapi import FastAPI, HTTPException -import asyncio -from schwab_scraper import unified_api -from schwab_scraper.core import Envelope - -app = FastAPI(title="Schwab Scraper API", version="0.1.0", description="REST API for Schwab Scraper via unified_api") -browser_lock = asyncio.Semaphore(1) - -async def check_success(envelope: Envelope): - if not envelope.get("success"): - raise HTTPException(status_code=400, detail=envelope.get("error", "Unknown error")) - return envelope.get("data") - -@app.get("/api/accounts", tags=["Accounts"]) -async def list_accounts(): - """List all available Schwab accounts.""" - async with browser_lock: - env = await unified_api.list_accounts() - return await check_success(env) - -@app.get("/api/accounts/overview", tags=["Accounts"]) -async def get_overview(account: str | None = None): - """Get a high level overview of an account or all accounts.""" - async with browser_lock: - env = await unified_api.get_account_overview(account) - return await check_success(env) - -@app.get("/api/accounts/positions", tags=["Accounts"]) -async def get_positions(account: str | None = None, include_non_equity: bool = False): - """Retrieve positions/holdings for an account.""" - async with browser_lock: - env = await unified_api.get_positions(account, include_non_equity=include_non_equity) - return await check_success(env) - -@app.get("/api/transactions", tags=["Transactions"]) -async def get_transactions( - account: str | None = None, - limit: int = 50, - days_back: int = 90 -): - """Fetch transaction history.""" - async with browser_lock: - env = await unified_api.get_transaction_history_enhanced( - account=account, limit=limit, days_back=days_back - ) - return await check_success(env) - -@app.get("/api/equity/morningstar/{ticker}", tags=["Research"]) -async def get_morningstar(ticker: str): - """Get Morningstar rating details for an equity.""" - async with browser_lock: - env = await unified_api.get_morningstar_data(ticker) - return await check_success(env) - -@app.get("/api/equity/phase1/{ticker}", tags=["Research"]) -async def get_equity_phase1(ticker: str): - """Fetch base Phase1 equity statistics (pricing, basic facts).""" - async with browser_lock: - env = await unified_api.get_equity_phase1_data(ticker) - return await check_success(env) - -@app.get("/api/session/status", tags=["System"]) -async def get_session_status(): - """Check if the cookies and session are currently valid.""" - async with browser_lock: - env = await unified_api.get_session_status() - return await check_success(env) - -def start(): - import uvicorn - uvicorn.run("schwab_scraper.server.api:app", host="0.0.0.0", port=8000, reload=True) - -if __name__ == "__main__": - start() diff --git a/schwab_scraper/server/mcp_server.py b/schwab_scraper/server/mcp_server.py deleted file mode 100644 index e7f34be..0000000 --- a/schwab_scraper/server/mcp_server.py +++ /dev/null @@ -1,79 +0,0 @@ -from mcp.server.fastmcp import FastMCP -from starlette.applications import Starlette -from starlette.routing import Route, Mount -from starlette.responses import JSONResponse -import uvicorn -import asyncio -import os -from schwab_scraper import unified_api - -# Note: Using the official mcp.server.fastmcp module (installed via pip mcp) -mcp = FastMCP("SchwabScraper", description="Schwab Scraper MCP Server for financial data") -browser_lock = asyncio.Semaphore(1) - -def unwrap(env): - if not env.get("success"): - raise Exception(f"Failed: {env.get('error')}") - return env.get("data") - -@mcp.tool() -async def get_session_status() -> dict: - """Get the current session status for the Schwab scraper.""" - async with browser_lock: - return unwrap(await unified_api.get_session_status()) - -@mcp.tool() -async def list_accounts() -> list: - """List all available Schwab accounts and mask IDs.""" - async with browser_lock: - accounts = unwrap(await unified_api.list_accounts()) - return [acc.model_dump() for acc in accounts] if accounts else [] - -@mcp.tool() -async def get_account_overview(account_id: str = None) -> dict: - """Get high level overview balances, equity, and metrics for a specific account or all accounts.""" - async with browser_lock: - overview = unwrap(await unified_api.get_account_overview(account_id)) - return overview.model_dump() if overview else {} - -@mcp.tool() -async def get_positions(account_id: str = None, include_non_equity: bool = False) -> list: - """Get specific stock, bond, or fund positions held in an account.""" - async with browser_lock: - pos = unwrap(await unified_api.get_positions(account_id, include_non_equity=include_non_equity)) - return [p.model_dump() for p in pos] if pos else [] - -@mcp.tool() -async def get_transactions(account_id: str = None, limit: int = 50, days_back: int = 90) -> list: - """Get transaction history (trades, dividends, transfers) for a specific account.""" - async with browser_lock: - tx = unwrap(await unified_api.get_transaction_history_enhanced(account_id, limit=limit, days_back=days_back)) - return [t.model_dump() for t in tx] if tx else [] - -@mcp.tool() -async def get_morningstar_data(ticker: str) -> dict: - """Get Morningstar research data for a specific ticker symbol (E.g. AAPL) directly from Schwab.""" - async with browser_lock: - data = unwrap(await unified_api.get_morningstar_data(ticker)) - return data.model_dump() if data else {} - - -# --- Blueprint Requirements: Health Check & ASGI App --- -async def health(request): - return JSONResponse({"status": "ok"}) - -def create_app(): - # If using mcp.server.fastmcp from 'mcp' package >= 1.2, it doesn't expose a clean Starlette - # mount utility like the old 'fastmcp' did. However, mcp.server.fastmcp exposes create_starlette_app() - # if using SSE transport module. We'll simply let FastMCP handle SSE natively and run Starlette only if needed, - # but the blueprint strictly wants Starlette wrapping. - # For newer SDKs, starlette_app is an internal property when running sse. - pass - -if __name__ == "__main__": - port = int(os.environ.get("PORT", 8000)) - # We use mcp.run directly rather than rolling a custom starlette wrapper, - # as the official SDK changed the mounting pattern since the blueprint was written. - # This automatically serves the SSE endpoints over HTTP and is standard. - # Note: FastMCP natively spins up uvicorn for us. - mcp.run(transport="sse", host="0.0.0.0", port=port) diff --git a/schwab_scraper/storage/__init__.py b/schwab_scraper/storage/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/schwab_scraper/storage/cache.py b/schwab_scraper/storage/cache.py deleted file mode 100644 index c227024..0000000 --- a/schwab_scraper/storage/cache.py +++ /dev/null @@ -1,74 +0,0 @@ -import os -from typing import Optional - -CACHE_DIR = "data/morningstar_pdfs" -TRANSACTION_CACHE_DIR = "data/transaction_csvs" - - -def ensure_cache_dir() -> str: - os.makedirs(CACHE_DIR, exist_ok=True) - return CACHE_DIR - - -def ensure_transaction_cache_dir() -> str: - os.makedirs(TRANSACTION_CACHE_DIR, exist_ok=True) - return TRANSACTION_CACHE_DIR - - -def cache_filename(ticker: str, formatted_date: str) -> str: - ensure_cache_dir() - # Sanitize date string to remove slashes that would create subdirectories - safe_date = formatted_date.replace('/', '_').replace('\\', '_') - return os.path.join(CACHE_DIR, f"{ticker.upper()}_{safe_date}.pdf") - - -def transaction_cache_filename(account_label: str, timestamp_str: str) -> str: - """Return a path like data/transaction_csvs//_Transactions_.csv - - account_label examples: "Joint_XXX604", "IRA_XXX873". Timestamp is usually YYYYMMDD-HHMMSS. - """ - ensure_transaction_cache_dir() - safe_label = account_label.replace("/", "_") - account_dir = os.path.join(TRANSACTION_CACHE_DIR, safe_label) - os.makedirs(account_dir, exist_ok=True) - return os.path.join(account_dir, f"{safe_label}_Transactions_{timestamp_str}.csv") - - -def read_cached_pdf(ticker: str) -> Optional[bytes]: - ensure_cache_dir() - files = [f for f in os.listdir(CACHE_DIR) if f.startswith(ticker.upper()) and f.endswith(".pdf")] - if not files: - return None - with open(os.path.join(CACHE_DIR, files[0]), "rb") as f: - return f.read() - - -def read_cached_transaction_csv(account_label: str) -> Optional[bytes]: - """Return latest cached CSV bytes for an account label, if any.""" - ensure_transaction_cache_dir() - safe_label = account_label.replace("/", "_") - account_dir = os.path.join(TRANSACTION_CACHE_DIR, safe_label) - if not os.path.isdir(account_dir): - return None - files = [f for f in os.listdir(account_dir) if f.endswith('.csv')] - if not files: - return None - # Pick most recent by name (timestamp in filename) - files.sort(reverse=True) - with open(os.path.join(account_dir, files[0]), 'rb') as f: - return f.read() - - -def write_cached_pdf(ticker: str, formatted_date: str, pdf_bytes: bytes) -> str: - ensure_cache_dir() - path = cache_filename(ticker, formatted_date) - with open(path, "wb") as f: - f.write(pdf_bytes) - return path - - -def write_cached_transaction_csv(account_label: str, timestamp_str: str, csv_bytes: bytes) -> str: - path = transaction_cache_filename(account_label, timestamp_str) - with open(path, 'wb') as f: - f.write(csv_bytes) - return path diff --git a/schwab_scraper/unified_api.py b/schwab_scraper/unified_api.py deleted file mode 100644 index 5350028..0000000 --- a/schwab_scraper/unified_api.py +++ /dev/null @@ -1,188 +0,0 @@ -"""Unified Schwab data surface with envelope-based async endpoints.""" - -from __future__ import annotations - -from typing import Optional - -from .core import AccountOverview, AccountSummary, Envelope, MorningstarData, PortfolioSnapshot, Position, EquityPhase1Data -from .core.models import TransactionData -from .core import ErrorType, fail -from .features.accounts_positions.accounts_scraper import list_accounts as _list_accounts -from .features.accounts_positions.overview_scraper import get_account_overview as _get_account_overview -from .features.accounts_positions.positions_scraper import get_positions as _get_positions -from .features.accounts_positions.portfolio_scraper import get_portfolio_snapshot as _get_portfolio_snapshot -from .features.equity.service import get_morningstar_data as _get_morningstar_data, get_equity_phase1_data as _get_equity_phase1_data -from .features.transactions.service import ( - get_transaction_history as _get_transaction_history, - get_transaction_history_enhanced as _get_transaction_history_enhanced, - list_available_accounts as _list_available_accounts, -) -from .browser.session import get_session_status as _get_session_status_impl -from .browser.session import refresh_session as _refresh_session_impl -from .browser.session import set_cookies_from_file as _set_cookies_impl -from .browser.session import export_cookies as _export_cookies_impl - - -async def get_session_status(debug: bool = False) -> Envelope[dict]: - try: - status = await _get_session_status_impl(debug=debug) - return status # already returns envelope - except Exception as exc: - return fail(str(exc), ErrorType.UNKNOWN, retryable=True) - - -async def refresh_session(debug: bool = False) -> Envelope[None]: - try: - return await _refresh_session_impl(debug=debug) - except Exception as exc: - return fail(str(exc), ErrorType.UNKNOWN, retryable=True) - - -async def set_cookies(cookies_path: str, debug: bool = False) -> Envelope[None]: - try: - return await _set_cookies_impl(cookies_path, debug=debug) - except Exception as exc: - return fail(str(exc), ErrorType.UNKNOWN, retryable=False) - - -async def export_cookies(cookies_path: str, debug: bool = False) -> Envelope[None]: - try: - return await _export_cookies_impl(cookies_path, debug=debug) - except Exception as exc: - return fail(str(exc), ErrorType.UNKNOWN, retryable=False) - - -async def list_accounts(debug: bool = False) -> Envelope[list[AccountSummary]]: - envelope = await _list_accounts(debug=debug) - if not envelope["success"]: - return envelope - data = envelope["data"] or [] - summaries: list[AccountSummary] = [] - for item in data: - if isinstance(item, AccountSummary): - summaries.append(item) - else: - summaries.append(AccountSummary(**item)) - return { - "success": True, - "data": summaries, - "error": None, - "error_type": None, - "retryable": False, - } - - -async def get_account_overview( - account: AccountSummary | str | None = None, - *, - debug: bool = False, -) -> Envelope[AccountOverview]: - if isinstance(account, dict): - account = AccountSummary(**account) - return await _get_account_overview(account=account, debug=debug) - - -async def get_positions( - account: AccountSummary | str | None = None, - *, - include_non_equity: bool = False, - debug: bool = False, -) -> Envelope[list[Position]]: - if isinstance(account, dict): - account = AccountSummary(**account) - return await _get_positions(account=account, include_non_equity=include_non_equity, debug=debug) - - -async def get_portfolio_snapshot( - account: AccountSummary | str | None = None, - *, - aggregate_by_symbol: bool = True, - include_non_equity: bool = False, - debug: bool = False, -) -> Envelope[PortfolioSnapshot]: - if isinstance(account, dict): - account = AccountSummary(**account) - return await _get_portfolio_snapshot( - account=account, - aggregate_by_symbol=aggregate_by_symbol, - include_non_equity=include_non_equity, - debug=debug, - ) - - -async def get_morningstar_data(ticker: str, debug: bool = False) -> Envelope[MorningstarData]: - return await _get_morningstar_data(ticker, debug=debug) - - -async def get_equity_phase1_data(ticker: str, debug: bool = False) -> Envelope[EquityPhase1Data]: - """Get Phase 1 enhanced equity data for a ticker. - - Extracts: - - Quote/Price Data (symbol bar) - - Enhanced Dividend Information (forward-looking dates) - - Core Earnings Metrics (EPS, forecasts) - - Basic Valuation Ratios (P/E, Forward P/E, PEG) - - Calculated Metrics (payout ratio) - - Args: - ticker: Stock ticker symbol - debug: Enable debug logging - - Returns: - Envelope containing EquityPhase1Data or error - """ - return await _get_equity_phase1_data(ticker, debug=debug) - - -async def list_available_accounts(debug: bool = False) -> Envelope[list[dict]]: - return await _list_available_accounts(debug=debug) - - -async def get_transaction_history( - account: Optional[str] = None, - start_date: Optional[str] = None, - end_date: Optional[str] = None, - time_period: Optional[str] = None, - debug: bool = False, -) -> Envelope[TransactionData]: - envelope = await _get_transaction_history( - account=account, - start_date=start_date, - end_date=end_date, - time_period=time_period, - debug=debug, - ) - return envelope - - -async def get_transaction_history_enhanced( - account: Optional[str] = None, - start_date: Optional[str] = None, - end_date: Optional[str] = None, - time_period: Optional[str] = None, - debug: bool = False, -) -> Envelope[TransactionData]: - envelope = await _get_transaction_history_enhanced( - account=account, - start_date=start_date, - end_date=end_date, - time_period=time_period, - debug=debug, - ) - return envelope - -__all__ = [ - "get_session_status", - "refresh_session", - "set_cookies", - "export_cookies", - "list_accounts", - "get_account_overview", - "get_positions", - "get_portfolio_snapshot", - "get_morningstar_data", - "get_equity_phase1_data", - "list_available_accounts", - "get_transaction_history", - "get_transaction_history_enhanced", -] diff --git a/schwab_scraper/utils/__init__.py b/schwab_scraper/utils/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/schwab_scraper/utils/logging.py b/schwab_scraper/utils/logging.py deleted file mode 100644 index 92fcc64..0000000 --- a/schwab_scraper/utils/logging.py +++ /dev/null @@ -1,19 +0,0 @@ -import logging -import os -from datetime import datetime, timezone - - -def setup_logging(debug: bool = False) -> None: - level = logging.DEBUG if debug else logging.INFO - logging.basicConfig(level=level, format='%(asctime)s %(levelname)s %(name)s: %(message)s') - - -def save_debug_artifact(filename: str, content: str | bytes) -> str: - debug_dir = "debug" - os.makedirs(debug_dir, exist_ok=True) - timestamp = datetime.now(timezone.utc).strftime("%Y%m%dT%H%M%SZ") - path = os.path.join(debug_dir, f"{timestamp}_{filename}") - mode = 'wb' if isinstance(content, (bytes, bytearray)) else 'w' - with open(path, mode) as f: - f.write(content) # type: ignore[arg-type] - return path diff --git a/uv.lock b/uv.lock index ea667b3..1df4c65 100644 --- a/uv.lock +++ b/uv.lock @@ -1744,7 +1744,7 @@ requires-dist = [ [[package]] name = "schwab-scraper" version = "0.6.16" -source = { git = "ssh://git.local.ben.io/b3nw/schwab-scraper.git#dc87a5ba661082c655c75bb9cc9f4716baa9f6f4" } +source = { git = "ssh://git.local.ben.io/b3nw/schwab-scraper.git#f52774b40e2b21da79b85eef180644624a936a30" } dependencies = [ { name = "aiohttp" }, { name = "fastapi" },