"""Schwab research-page workflows: Phase 1 equity data and Morningstar report data."""

import logging
import re
import time
from datetime import datetime
from typing import Any, Dict, Optional

from ...core.config import load_config, get_playwright_url
from ...browser.auth import ensure_cookies
from ...browser.client import connect, new_context, new_page
from ...browser.navigation import goto_with_auth_check
from ...core import Envelope, ErrorType, MorningstarData, EquityPhase1Data, fail, ok
from .morningstar import find_report, download_report_as_bytes
from ...storage.cache import ensure_cache_dir, cache_filename, read_cached_pdf, write_cached_pdf
from .parser import parse as parse_pdf
from .scraper import extract_dividend_data
from .phase1_scraper import extract_phase1_data  # DOM scraping - the working approach

logger = logging.getLogger(__name__)

_SESSION_ERROR = (
    "Unable to establish a session. Provide credentials in config.json or a valid cookies.json."
)

# Either element becoming visible indicates the page has loaded stock-specific content.
_STOCK_CONTENT_SELECTOR = (
    'span.sdps-title-3.sc-sdps-solo-layout:not(.sdps-font-bold), #morningstar-section'
)

# Marker that find_report may return instead of a real URL; it is never stored as the report URL.
_CLICK_TO_OPEN = '__CLICK_TO_OPEN__'

# Content check used by the Phase 1 flow: company-name span or Morningstar section.
_PHASE1_CONTENT_CHECK_JS = '''
    () => {
        const nameSpan = document.querySelector('span.sdps-title-3.sc-sdps-solo-layout:not(.sdps-font-bold)');
        if (nameSpan && nameSpan.textContent && nameSpan.textContent.trim().length > 2) {
            return true;
        }
        const morningstarSection = document.querySelector('#morningstar-section');
        if (morningstarSection) {
            return true;
        }
        return false;
    }
'''

# Content check used by the Morningstar flow: same as above plus profile text and
# generic stock containers.
_MORNINGSTAR_CONTENT_CHECK_JS = '''
    () => {
        // Look for company name span (valid stock pages have this)
        const nameSpan = document.querySelector('span.sdps-title-3.sc-sdps-solo-layout:not(.sdps-font-bold)');
        if (nameSpan && nameSpan.textContent && nameSpan.textContent.trim().length > 2) {
            return true;
        }
        // Look for Morningstar section (valid stock pages have this)
        const morningstarSection = document.querySelector('#morningstar-section');
        if (morningstarSection) {
            return true;
        }
        // Look for company profile description (valid stock pages have this)
        const profileText = document.querySelector('p.sdps-text-body.sc-sdps-solo-layout');
        if (profileText && profileText.textContent && profileText.textContent.trim().length > 50) {
            return true;
        }
        // Look for any stock-related content
        const stockContent = document.querySelector('#stock-details, #quote, [data-testid="stock-quote"]');
        if (stockContent) {
            return true;
        }
        return false;
    }
'''

# Company name: title span first, then the leading phrase of the profile description.
_COMPANY_NAME_JS = '''
    () => {
        // Look for company name in title span
        const nameSpan = document.querySelector('span.sdps-title-3.sc-sdps-solo-layout:not(.sdps-font-bold)');
        if (nameSpan && nameSpan.textContent && nameSpan.textContent.trim().length > 2) {
            return nameSpan.textContent.trim();
        }
        // Fallback: Extract from company profile description
        const profileText = document.querySelector('p.sdps-text-body.sc-sdps-solo-layout');
        if (profileText && profileText.textContent) {
            const text = profileText.textContent.trim();
            // Extract company name before " designs" or " is" or " provides"
            const match = text.match(/^([A-Za-z0-9\\s&\\.,'-]+?)(?:\\s+(?:designs|is|provides|manufactures|operates|offers|engages))/i);
            if (match) {
                return match[1].trim();
            }
        }
        return null;
    }
'''


def extract_company_name_from_title(page_title: str, ticker: str) -> Optional[str]:
    """Derive a company name from a research page title.

    Known Schwab suffixes/phrases are removed from the title, then the text
    preceding ``(TICKER)`` is used. If that pattern is absent, the text before
    the first ``" |"`` or ``" -"`` separator is used instead.

    Args:
        page_title: Raw page title.
        ticker: Ticker symbol; matched case-insensitively.

    Returns:
        The extracted name, or None when the title is empty, nothing usable is
        found, or any error occurs while processing.
    """
    if not page_title:
        return None
    try:
        # Replacement order is kept as-is: the fallback below relies on the
        # " -" that remains after "Research" is removed.
        title = (
            page_title.replace(" | Charles Schwab", "")
            .replace(" - Charles Schwab", "")
            .replace("Stock Quote & Summary", "")
            .replace("Stock Research", "")
            .replace("Research", "")
            .replace("- Research", "")
        )
        symbol = ticker.upper()
        match = re.match(rf"^(.+?)\s*\({re.escape(symbol)}\)", title, re.IGNORECASE)
        if match:
            company_name = match.group(1).strip().replace(" -", "").strip()
            if len(company_name) > 1 and not company_name.isdigit():
                return company_name
        for separator in (" |", " -"):
            if separator in title:
                potential_name = title.split(separator)[0].strip()
                if potential_name.upper() != symbol and len(potential_name) > 1:
                    return potential_name
        return None
    except Exception:
        # Best-effort helper: callers get None rather than an exception.
        return None


async def _load_stock_page(
    page: Any, context: Any, ticker: str, debug: bool, content_check_js: str
) -> Optional[Envelope]:
    """Navigate to the ticker's research page and confirm it shows stock content.

    Returns a failure envelope when navigation/authentication fails or the page
    does not look like a valid stock page; returns None when the page is usable.
    """
    timeout = 30000 if debug else 45000
    success = await goto_with_auth_check(
        page,
        context,
        f"https://client.schwab.com/app/research/#/stocks/{ticker}",
        debug=debug,
        timeout=timeout,
    )
    if not success:
        return fail(
            "Authentication failed while navigating to research page",
            ErrorType.AUTHENTICATION,
            retryable=True,
        )

    if debug:
        logger.debug("Current page URL: %s", page.url)

    invalid_ticker = f"Invalid ticker: {ticker}. This appears not to be a valid stock ticker."

    # The page populates asynchronously, so wait for stock-specific content
    # before validating. If nothing appears within 10 seconds the ticker is
    # treated as invalid.
    try:
        await page.wait_for_selector(_STOCK_CONTENT_SELECTOR, timeout=10000, state='visible')
    except Exception as wait_err:
        if debug:
            logger.debug("Timeout waiting for stock content: %s", wait_err)
        return fail(invalid_ticker, ErrorType.VALIDATION, retryable=False)

    try:
        has_valid_content = await page.evaluate(content_check_js)
    except Exception as exc:
        # If the check itself cannot run, assume invalid and report an error.
        logger.debug("Error checking for valid content: %s", exc)
        return fail(
            f"Invalid ticker: {ticker}. Unable to validate ticker.",
            ErrorType.VALIDATION,
            retryable=False,
        )

    if debug:
        logger.debug("Valid stock content detected: %s", has_valid_content)
    if not has_valid_content:
        if debug:
            logger.debug("Invalid ticker detected - no stock content found")
        return fail(invalid_ticker, ErrorType.VALIDATION, retryable=False)
    return None


async def _release_browser(page: Any, context: Any, browser: Any, playwright: Any) -> None:
    """Close page, context and browser, then stop Playwright; never raises on failure."""
    steps = (
        ("page", page, "close"),
        ("context", context, "close"),
        ("browser", browser, "close"),
        ("playwright", playwright, "stop"),
    )
    for label, resource, method in steps:
        if resource is None:
            continue
        try:
            await getattr(resource, method)()
        except Exception as exc:
            # Cleanup is best-effort; one failure must not block the remaining steps.
            logger.debug("Ignoring error while releasing %s: %s", label, exc)


def _cache_date_label(report_date: Optional[str]) -> str:
    """Return the date label passed to write_cached_pdf for a report.

    A date in ``"%b %d, %Y"`` form is reformatted as ``MM-DD-YYYY``. Any other
    non-empty text has its spaces replaced with dashes. A missing or blank date
    falls back to today's date.
    """
    cleaned = (report_date or "").strip()
    if not cleaned:
        return time.strftime("%m-%d-%Y")
    try:
        # Parse the stripped text: surrounding whitespace would make strptime fail.
        return datetime.strptime(cleaned, "%b %d, %Y").strftime("%m-%d-%Y")
    except ValueError:
        return cleaned.replace(" ", "-")


def _parse_report(
    ticker: str, report_date: Optional[str], pdf_bytes: Optional[bytes]
) -> Dict[str, Any]:
    """Parse the downloaded report, or the cached one when no download is available.

    The result carries ``"source"`` (``"live"`` or ``"cache"``) on success, or
    an ``"error"`` message when parsing fails or nothing is available.
    """
    if pdf_bytes:
        write_cached_pdf(ticker, _cache_date_label(report_date), pdf_bytes)
        try:
            parsed = parse_pdf(pdf_bytes)
            parsed["source"] = "live"
        except Exception as exc:
            logger.debug("PDF parsing failed: %s", exc)
            return {"error": "Failed to parse Morningstar report"}
        return parsed

    cached = read_cached_pdf(ticker)
    if not cached:
        return {"error": f"Failed to download and no cache available for {ticker}"}
    try:
        parsed = parse_pdf(cached)
        parsed["source"] = "cache"
    except Exception as exc:
        logger.debug("Cached PDF parsing failed: %s", exc)
        return {"error": "Failed to parse cached Morningstar report"}
    return parsed


async def get_equity_phase1_data(ticker: str, debug: bool = False) -> Envelope[EquityPhase1Data]:
    """Get Phase 1 enhanced equity data for a ticker.

    Extracts:
    - Quote/Price Data (symbol bar)
    - Enhanced Dividend Information (forward-looking dates)
    - Core Earnings Metrics (EPS, forecasts)
    - Basic Valuation Ratios (P/E, Forward P/E, PEG)
    - Calculated Metrics (payout ratio)

    Args:
        ticker: Stock ticker symbol
        debug: Enable debug logging

    Returns:
        Envelope containing EquityPhase1Data or error
    """
    ticker = ticker.upper()
    if debug:
        logger.setLevel(logging.DEBUG)
        logger.debug("Starting get_equity_phase1_data for %s", ticker)

    # Session management
    cookies = await ensure_cookies()
    if not cookies:
        return fail(_SESSION_ERROR, ErrorType.AUTHENTICATION, retryable=False)

    playwright_url = get_playwright_url(load_config())

    # Browser orchestration
    context = None
    page = None
    p, browser = await connect(playwright_url)
    try:
        context = await new_context(browser, cookies=cookies)
        page = await new_page(context)

        failure = await _load_stock_page(page, context, ticker, debug, _PHASE1_CONTENT_CHECK_JS)
        if failure is not None:
            return failure

        # Extract Phase 1 data using improved DOM scraping
        # Note: API approach failed due to CORS restrictions
        phase1_data = await extract_phase1_data(page, debug=debug)
        return ok(phase1_data)
    finally:
        await _release_browser(page, context, browser, p)


async def get_morningstar_data(ticker: str, debug: bool = False) -> Envelope[MorningstarData]:
    """Get dividend details and Morningstar report data for a ticker.

    Scrapes the company name and dividend data from the research page, locates
    and downloads the Morningstar report (falling back to a cached PDF when the
    download yields nothing), and parses it.

    Args:
        ticker: Stock ticker symbol
        debug: Enable debug logging

    Returns:
        Envelope containing MorningstarData, or an error envelope for
        authentication, validation or report-parsing failures.
    """
    ticker = ticker.upper()
    ensure_cache_dir()
    if debug:
        logger.setLevel(logging.DEBUG)
        logger.debug("Starting get_morningstar_data for %s", ticker)

    # Session management
    cookies = await ensure_cookies()
    if not cookies:
        return fail(_SESSION_ERROR, ErrorType.AUTHENTICATION, retryable=False)

    playwright_url = get_playwright_url(load_config())

    # Browser orchestration
    context = None
    page = None
    p, browser = await connect(playwright_url)
    try:
        context = await new_context(browser, cookies=cookies)
        page = await new_page(context)

        failure = await _load_stock_page(
            page, context, ticker, debug, _MORNINGSTAR_CONTENT_CHECK_JS
        )
        if failure is not None:
            return failure

        # Company name - extract from page elements
        company_name = None
        try:
            company_name = await page.evaluate(_COMPANY_NAME_JS)
            if debug and company_name:
                logger.debug("Extracted company name: %s", company_name)
        except Exception as exc:
            logger.debug("Company name extraction error: %s", exc)

        # Morningstar section wait; a timeout is tolerated and later steps still run
        try:
            await page.wait_for_selector('#morningstar-section', timeout=30000)
        except Exception:
            logger.debug("#morningstar-section not found within timeout")

        # Dividends
        try:
            dividend_data = await extract_dividend_data(page, debug=debug)
        except Exception as exc:
            logger.debug("Dividend extraction error: %s", exc)
            dividend_data = {}

        # Find report and download/cache
        report_url, report_date = await find_report(page, debug=debug)
        stored_date = report_date.strip() if report_date else None
        stored_url = None
        pdf_bytes = None
        if report_url:
            # Only store actual URL, not the __CLICK_TO_OPEN__ marker
            if report_url != _CLICK_TO_OPEN:
                stored_url = report_url
            pdf_bytes = await download_report_as_bytes(page, report_url, debug=debug)

        parsed_data = _parse_report(ticker, report_date, pdf_bytes)
        if parsed_data.get("error"):
            # Checked before building the result object, which would be discarded anyway.
            return fail(parsed_data["error"], ErrorType.PARSING, retryable=True)

        morningstar = MorningstarData(
            ticker=ticker,
            company_name=company_name,
            previous_dividend_payment=dividend_data.get("Previous Dividend Payment"),
            previous_pay_date=dividend_data.get("Previous Pay Date"),
            previous_ex_date=dividend_data.get("Previous Ex-Dividend Date"),
            frequency=dividend_data.get("Frequency"),
            annual_dividend_rate=dividend_data.get("Annual Dividend Rate"),
            annual_dividend_yield=dividend_data.get("Annual Dividend Yield"),
            fair_value=parsed_data.get("Fair Value"),
            economic_moat=parsed_data.get("Economic Moat"),
            capital_allocation=parsed_data.get("Capital Allocation"),
            rating=_safe_int(parsed_data.get("Morningstar Rating")),
            one_star_price=parsed_data.get("1-Star Price"),
            five_star_price=parsed_data.get("5-Star Price"),
            assessment=parsed_data.get("Assessment"),
            range_52_week=parsed_data.get("52-Week Range"),
            dividend_yield=parsed_data.get("Dividend Yield"),
            investment_style=parsed_data.get("Investment Style"),
            report_url=stored_url,
            report_date=stored_date,
            source=parsed_data.get("source"),
        )
        return ok(morningstar)
    finally:
        await _release_browser(page, context, browser, p)


def _safe_int(value: Any) -> Optional[int]:
    """Convert value to int via its stripped string form; return None if missing or not an integer."""
    if value is None:
        return None
    try:
        return int(str(value).strip())
    except (TypeError, ValueError):
        return None