# schwab-mcp-custom/schwab_scraper/features/equity/service.py

import time
from typing import Any, Dict, Optional
import logging
from ...core.config import load_config, get_playwright_url
from ...browser.auth import ensure_cookies
from ...browser.client import connect, new_context, new_page
from ...browser.navigation import goto_with_auth_check
from ...core import Envelope, ErrorType, MorningstarData, EquityPhase1Data, fail, ok
from .morningstar import find_report, download_report_as_bytes
from ...storage.cache import ensure_cache_dir, cache_filename, read_cached_pdf, write_cached_pdf
from .parser import parse as parse_pdf
from .scraper import extract_dividend_data
from .phase1_scraper import extract_phase1_data  # DOM scraping - the working approach
import re

def extract_company_name_from_title(page_title: str, ticker: str) -> Optional[str]:
    """Derive a company name from a Schwab research page title.

    Site boilerplate is stripped from the title, then the name is taken from
    a leading ``Name (TICKER)`` pattern. If that pattern is absent, the text
    before the first ``" |"`` or ``" -"`` separator is used instead, provided
    it is not just the ticker itself.

    Args:
        page_title: Raw browser page title; may be empty or ``None``.
        ticker: Ticker symbol, compared case-insensitively.

    Returns:
        The extracted company name, or ``None`` when no plausible name is
        found or the inputs are not strings.
    """
    if not page_title:
        return None
    try:
        symbol = ticker.upper()

        title = page_title
        for boilerplate in (
            " | Charles Schwab",
            " - Charles Schwab",
            "Stock Quote & Summary",
            "Stock Research",
        ):
            title = title.replace(boilerplate, "")

        # Drop "Research" only when it is a whole title segment (bounded by
        # the start/end of the title or a "|" / "-" separator). Removing the
        # bare substring would corrupt names such as "Lam Research Corp".
        title = re.sub(r"(?:^|(?<=[|\-]))\s*Research\s*(?=$|[|\-])", "", title)

        match = re.match(rf"^(.+?)\s*\({re.escape(symbol)}\)", title, re.IGNORECASE)
        if match:
            company_name = match.group(1).strip().replace(" -", "").strip()
            if len(company_name) > 1 and not company_name.isdigit():
                return company_name

        # Fallback: take the first separator-delimited segment of the title.
        for separator in (" |", " -"):
            if separator in title:
                candidate = title.split(separator)[0].strip()
                if candidate.upper() != symbol and len(candidate) > 1:
                    return candidate
        return None
    except (AttributeError, TypeError, re.error):
        # Non-string inputs (e.g. ticker=None) yield "no name", not a crash.
        return None


async def get_equity_phase1_data(ticker: str, debug: bool = False) -> Envelope[EquityPhase1Data]:
    """Scrape the Phase 1 equity data set for one ticker.

    The Schwab stock research page is opened in a Playwright browser, checked
    for stock-specific content, and handed to ``extract_phase1_data``. Per the
    scraper's contract the Phase 1 set covers:
    - Quote/Price Data (symbol bar)
    - Enhanced Dividend Information (forward-looking dates)
    - Core Earnings Metrics (EPS, forecasts)
    - Basic Valuation Ratios (P/E, Forward P/E, PEG)
    - Calculated Metrics (payout ratio)

    Args:
        ticker: Stock ticker symbol (upper-cased before use)
        debug: Enable debug logging; also selects the shorter navigation timeout

    Returns:
        Envelope containing EquityPhase1Data, or a failure envelope with
        ErrorType.AUTHENTICATION (no session / navigation rejected) or
        ErrorType.VALIDATION (page shows no stock content)
    """
    ticker = ticker.upper()
    logger = logging.getLogger(__name__)
    if debug:
        logger.setLevel(logging.DEBUG)
        logger.debug(f"Starting get_equity_phase1_data for {ticker}")

    # Nothing below can work without session cookies.
    session_cookies = await ensure_cookies()
    if not session_cookies:
        return fail(
            "Unable to establish a session. Provide credentials in config.json or a valid cookies.json.",
            ErrorType.AUTHENTICATION,
            retryable=False,
        )

    endpoint = get_playwright_url(load_config())

    research_url = f"https://client.schwab.com/app/research/#/stocks/{ticker}"
    content_selector = (
        "span.sdps-title-3.sc-sdps-solo-layout:not(.sdps-font-bold), #morningstar-section"
    )
    nav_timeout = 45000 if not debug else 30000

    def invalid_ticker():
        # Shared envelope for "page loaded but shows no stock content".
        return fail(
            f"Invalid ticker: {ticker}. This appears not to be a valid stock ticker.",
            ErrorType.VALIDATION,
            retryable=False,
        )

    ctx = None
    tab = None
    p, browser = await connect(endpoint)
    try:
        ctx = await new_context(browser, cookies=session_cookies)
        tab = await new_page(ctx)

        # Open the stock research page; the helper reports auth failures as falsy.
        navigated = await goto_with_auth_check(
            tab, ctx, research_url, debug=debug, timeout=nav_timeout
        )
        if not navigated:
            return fail(
                "Authentication failed while navigating to research page",
                ErrorType.AUTHENTICATION,
                retryable=True,
            )

        if debug:
            logger.debug(f"Current page URL: {tab.url}")

        # A page that never shows the name span or the Morningstar section is
        # treated as an invalid ticker.
        try:
            await tab.wait_for_selector(content_selector, timeout=10000, state="visible")
        except Exception as wait_err:
            if debug:
                logger.debug(f"Timeout waiting for stock content: {wait_err}")
            return invalid_ticker()

        # Second check: the matched elements must carry real content.
        try:
            has_valid_content = await tab.evaluate('''
                () => {
                    const nameSpan = document.querySelector('span.sdps-title-3.sc-sdps-solo-layout:not(.sdps-font-bold)');
                    if (nameSpan && nameSpan.textContent && nameSpan.textContent.trim().length > 2) {
                        return true;
                    }
                    const morningstarSection = document.querySelector('#morningstar-section');
                    if (morningstarSection) {
                        return true;
                    }
                    return false;
                }
            ''')

            if not has_valid_content:
                return invalid_ticker()
        except Exception as e:
            logger.debug(f"Error checking for valid content: {e}")
            return fail(
                f"Invalid ticker: {ticker}. Unable to validate ticker.",
                ErrorType.VALIDATION,
                retryable=False,
            )

        # DOM scraping is used here; the original note records that an API
        # approach failed due to CORS restrictions.
        return ok(await extract_phase1_data(tab, debug=debug))

    finally:
        # Best-effort teardown, innermost resource first; a failure closing
        # one handle must not prevent closing the rest.
        for resource, closer in ((tab, "close"), (ctx, "close"), (browser, "close"), (p, "stop")):
            try:
                if resource is not None:
                    await getattr(resource, closer)()
            except Exception:
                pass


def _report_cache_date(report_date: Optional[str]) -> str:
    """Return the ``MM-DD-YYYY`` label used when caching a report PDF.

    The page's report date is stripped before parsing so that surrounding
    whitespace does not defeat ``strptime``. A date that does not match
    ``"%b %d, %Y"`` is kept with spaces replaced by dashes; a missing or
    blank date falls back to today's date.
    """
    from datetime import datetime

    cleaned = report_date.strip() if report_date else ""
    if not cleaned:
        return time.strftime("%m-%d-%Y")
    try:
        return datetime.strptime(cleaned, "%b %d, %Y").strftime("%m-%d-%Y")
    except ValueError:
        return cleaned.replace(" ", "-")


async def get_morningstar_data(ticker: str, debug: bool = False) -> Envelope[MorningstarData]:
    """Collect Morningstar report data and dividend details for a ticker.

    Opens the Schwab stock research page in a Playwright browser, checks that
    it shows stock-specific content, scrapes the company name and dividend
    fields, then looks up the Morningstar equity report. A downloaded PDF is
    written to the cache and parsed; when nothing is downloaded, the PDF
    returned by ``read_cached_pdf(ticker)`` is parsed instead.

    Args:
        ticker: Stock ticker symbol (upper-cased before use).
        debug: Enable debug logging; also selects the shorter navigation timeout.

    Returns:
        Envelope containing MorningstarData on success. Failure envelopes use
        ErrorType.AUTHENTICATION (no session / navigation rejected),
        ErrorType.VALIDATION (page shows no stock content) or
        ErrorType.PARSING (report could not be downloaded or parsed).
    """
    ticker = ticker.upper()
    ensure_cache_dir()
    logger = logging.getLogger(__name__)
    if debug:
        logger.setLevel(logging.DEBUG)
        logger.debug(f"Starting get_morningstar_data for {ticker}")

    # Session management
    cookies = await ensure_cookies()
    if not cookies:
        return fail(
            "Unable to establish a session. Provide credentials in config.json or a valid cookies.json.",
            ErrorType.AUTHENTICATION,
            retryable=False,
        )

    config = load_config()
    playwright_url = get_playwright_url(config)

    # Browser orchestration
    context = None
    page = None
    p, browser = await connect(playwright_url)
    try:
        context = await new_context(browser, cookies=cookies)
        page = await new_page(context)

        # Use shared auth-aware navigation helper for consistency
        # Use shorter timeout for tests to speed up execution
        timeout = 30000 if debug else 45000
        success = await goto_with_auth_check(
            page,
            context,
            f"https://client.schwab.com/app/research/#/stocks/{ticker}",
            debug=debug,
            timeout=timeout,
        )
        if not success:
            return fail(
                "Authentication failed while navigating to research page",
                ErrorType.AUTHENTICATION,
                retryable=True,
            )

        # Validate ticker by checking for stock page content
        # Schwab doesn't redirect on invalid tickers, but the page content is empty/invalid
        if debug:
            logger.debug(f"Current page URL: {page.url}")

        # Wait for page content to load - Schwab's research page loads asynchronously
        # Give it time to populate the DOM before validation
        try:
            # Wait for either company name or Morningstar section to appear
            # This indicates the page has loaded stock-specific content
            await page.wait_for_selector(
                'span.sdps-title-3.sc-sdps-solo-layout:not(.sdps-font-bold), #morningstar-section',
                timeout=10000,
                state='visible'
            )
        except Exception as wait_err:
            # If neither selector appears after 10 seconds, likely an invalid ticker
            if debug:
                logger.debug(f"Timeout waiting for stock content: {wait_err}")
            return fail(
                f"Invalid ticker: {ticker}. This appears not to be a valid stock ticker.",
                ErrorType.VALIDATION,
                retryable=False,
            )

        # Additional validation: check if we have valid stock page content
        try:
            has_valid_content = await page.evaluate('''
                () => {
                    // Look for company name span (valid stock pages have this)
                    const nameSpan = document.querySelector('span.sdps-title-3.sc-sdps-solo-layout:not(.sdps-font-bold)');
                    if (nameSpan && nameSpan.textContent && nameSpan.textContent.trim().length > 2) {
                        return true;
                    }

                    // Look for Morningstar section (valid stock pages have this)
                    const morningstarSection = document.querySelector('#morningstar-section');
                    if (morningstarSection) {
                        return true;
                    }

                    // Look for company profile description (valid stock pages have this)
                    const profileText = document.querySelector('p.sdps-text-body.sc-sdps-solo-layout');
                    if (profileText && profileText.textContent && profileText.textContent.trim().length > 50) {
                        return true;
                    }

                    // Look for any stock-related content
                    const stockContent = document.querySelector('#stock-details, #quote, [data-testid="stock-quote"]');
                    if (stockContent) {
                        return true;
                    }

                    return false;
                }
            ''')

            if debug:
                logger.debug(f"Valid stock content detected: {has_valid_content}")

            if not has_valid_content:
                if debug:
                    logger.debug("Invalid ticker detected - no stock content found")
                return fail(
                    f"Invalid ticker: {ticker}. This appears not to be a valid stock ticker.",
                    ErrorType.VALIDATION,
                    retryable=False,
                )
        except Exception as e:
            logger.debug(f"Error checking for valid content: {e}")
            # If we can't check, assume invalid and return error
            return fail(
                f"Invalid ticker: {ticker}. Unable to validate ticker.",
                ErrorType.VALIDATION,
                retryable=False,
            )

        # Company name - extract from page elements (best effort; stays None on error)
        company_name = None
        try:
            # Strategy 1: Extract from company name span element
            company_name = await page.evaluate('''
                () => {
                    // Look for company name in title span
                    const nameSpan = document.querySelector('span.sdps-title-3.sc-sdps-solo-layout:not(.sdps-font-bold)');
                    if (nameSpan && nameSpan.textContent && nameSpan.textContent.trim().length > 2) {
                        return nameSpan.textContent.trim();
                    }

                    // Fallback: Extract from company profile description
                    const profileText = document.querySelector('p.sdps-text-body.sc-sdps-solo-layout');
                    if (profileText && profileText.textContent) {
                        const text = profileText.textContent.trim();
                        // Extract company name before " designs" or " is" or " provides"
                        const match = text.match(/^([A-Za-z0-9\\s&\\.,'-]+?)(?:\\s+(?:designs|is|provides|manufactures|operates|offers|engages))/i);
                        if (match) {
                            return match[1].trim();
                        }
                    }

                    return null;
                }
            ''')
            if debug and company_name:
                logger.debug(f"Extracted company name: {company_name}")
        except Exception as e:
            logger.debug(f"Company name extraction error: {e}")

        # Morningstar section wait (non-fatal: later steps run either way)
        try:
            await page.wait_for_selector('#morningstar-section', timeout=30000)
        except Exception:
            logger.debug("#morningstar-section not found within timeout")

        # Dividends (best effort; a None result is treated as "no data")
        try:
            dividend_data = await extract_dividend_data(page, debug=debug) or {}
        except Exception as exc:
            logger.debug(f"Dividend extraction error: {exc}")
            dividend_data = {}

        # Find report and download/cache
        report_url, report_date = await find_report(page, debug=debug)
        result_report_date = report_date.strip() if report_date else None
        # Only expose an actual URL, not the __CLICK_TO_OPEN__ marker
        result_report_url = (
            report_url if report_url and report_url != '__CLICK_TO_OPEN__' else None
        )
        pdf_bytes = (
            await download_report_as_bytes(page, report_url, debug=debug)
            if report_url
            else None
        )

        parsed_data: Dict[str, Any] = {}
        if pdf_bytes:
            # Caching is an optimisation: a failed write must not discard a
            # report that was downloaded successfully.
            try:
                write_cached_pdf(ticker, _report_cache_date(report_date), pdf_bytes)
            except OSError as exc:
                logger.warning("Could not cache Morningstar report for %s: %s", ticker, exc)
            try:
                parsed_data = parse_pdf(pdf_bytes)
                parsed_data["source"] = "live"
            except Exception as exc:
                logger.debug(f"PDF parsing failed: {exc}")
                parsed_data = {"error": "Failed to parse Morningstar report"}
        else:
            cached = read_cached_pdf(ticker)
            if cached:
                try:
                    parsed_data = parse_pdf(cached)
                    parsed_data["source"] = "cache"
                except Exception as exc:
                    logger.debug(f"Cached PDF parsing failed: {exc}")
                    parsed_data = {"error": "Failed to parse cached Morningstar report"}
            else:
                parsed_data = {"error": f"Failed to download and no cache available for {ticker}"}

        # Bail out before building the result model when the report is unusable.
        if parsed_data.get("error"):
            return fail(parsed_data["error"], ErrorType.PARSING, retryable=True)

        morningstar = MorningstarData(
            ticker=ticker,
            company_name=company_name,
            previous_dividend_payment=dividend_data.get("Previous Dividend Payment"),
            previous_pay_date=dividend_data.get("Previous Pay Date"),
            previous_ex_date=dividend_data.get("Previous Ex-Dividend Date"),
            frequency=dividend_data.get("Frequency"),
            annual_dividend_rate=dividend_data.get("Annual Dividend Rate"),
            annual_dividend_yield=dividend_data.get("Annual Dividend Yield"),
            fair_value=parsed_data.get("Fair Value"),
            economic_moat=parsed_data.get("Economic Moat"),
            capital_allocation=parsed_data.get("Capital Allocation"),
            rating=_safe_int(parsed_data.get("Morningstar Rating")),
            one_star_price=parsed_data.get("1-Star Price"),
            five_star_price=parsed_data.get("5-Star Price"),
            assessment=parsed_data.get("Assessment"),
            range_52_week=parsed_data.get("52-Week Range"),
            dividend_yield=parsed_data.get("Dividend Yield"),
            investment_style=parsed_data.get("Investment Style"),
            report_url=result_report_url,
            report_date=result_report_date,
            source=parsed_data.get("source"),
        )

        return ok(morningstar)

    finally:
        # Best-effort teardown, innermost resource first; a failure closing
        # one handle must not prevent closing the rest.
        for resource, closer in ((page, "close"), (context, "close"), (browser, "close"), (p, "stop")):
            try:
                if resource is not None:
                    await getattr(resource, closer)()
            except Exception:
                pass


def _safe_int(value: Any) -> Optional[int]:
    if value is None:
        return None
    try:
        return int(str(value).strip())
    except (TypeError, ValueError):
        return None