Files
schwab-mcp-custom/schwab_scraper/features/equity/service.py
b3nw 650ea2d087
All checks were successful
Build and Push Docker Image / build (push) Successful in 34s
Fix build: Bundle schwab_scraper source and use local dependencies
2026-04-24 01:50:20 +00:00

453 lines
18 KiB
Python

import time
from typing import Any, Dict, Optional
import logging
from ...core.config import load_config, get_playwright_url
from ...browser.auth import ensure_cookies
from ...browser.client import connect, new_context, new_page
from ...browser.navigation import goto_with_auth_check
from ...core import Envelope, ErrorType, MorningstarData, EquityPhase1Data, fail, ok
from .morningstar import find_report, download_report_as_bytes
from ...storage.cache import ensure_cache_dir, cache_filename, read_cached_pdf, write_cached_pdf
from .parser import parse as parse_pdf
from .scraper import extract_dividend_data
from .phase1_scraper import extract_phase1_data # DOM scraping - the working approach
import re
def extract_company_name_from_title(page_title: str, ticker: str):
    """Derive a company name from a research page title.

    Known boilerplate fragments are removed from the title first.  The name
    is then taken from the text that precedes ``(TICKER)`` when the title has
    that shape; failing that, from the text before the first `` |`` or `` -``
    separator.

    Args:
        page_title: Raw page title.
        ticker: Ticker symbol; compared case-insensitively.

    Returns:
        The extracted name, or ``None`` when the title is empty, no plausible
        name is found, or any error occurs while parsing.
    """
    if not page_title:
        return None

    try:
        cleaned = page_title
        # Removal order is significant: each fragment is searched for in the
        # result of the previous removals.
        for fragment in (
            " | Charles Schwab",
            " - Charles Schwab",
            "Stock Quote & Summary",
            "Stock Research",
            "Research",
            "- Research",
        ):
            cleaned = cleaned.replace(fragment, "")

        # Preferred shape: "<name> (<TICKER>)".
        pattern = rf"^(.+?)\s*\({re.escape(ticker.upper())}\)"
        hit = re.match(pattern, cleaned, re.IGNORECASE)
        if hit:
            candidate = hit.group(1).strip().replace(" -", "").strip()
            if len(candidate) > 1 and not candidate.isdigit():
                return candidate

        # Fallback: whatever precedes the first separator, unless that is
        # just the ticker itself.
        for sep in (" |", " -"):
            if sep not in cleaned:
                continue
            head = cleaned.split(sep)[0].strip()
            if head.upper() != ticker.upper() and len(head) > 1:
                return head
    except Exception:
        return None
    return None
async def get_equity_phase1_data(ticker: str, debug: bool = False) -> Envelope[EquityPhase1Data]:
    """Scrape Phase 1 enhanced equity data for a ticker.

    Phase 1 covers:
    - Quote/price data (symbol bar)
    - Enhanced dividend information (forward-looking dates)
    - Core earnings metrics (EPS, forecasts)
    - Basic valuation ratios (P/E, Forward P/E, PEG)
    - Calculated metrics (payout ratio)

    Args:
        ticker: Stock ticker symbol (upper-cased before use).
        debug: Enable debug logging; also selects the shorter navigation timeout.

    Returns:
        Envelope wrapping EquityPhase1Data on success, or a failure envelope
        typed AUTHENTICATION or VALIDATION.
    """
    ticker = ticker.upper()
    logger = logging.getLogger(__name__)
    if debug:
        logger.setLevel(logging.DEBUG)
        logger.debug(f"Starting get_equity_phase1_data for {ticker}")

    # A session (cookies) is a precondition for everything below.
    session_cookies = await ensure_cookies()
    if not session_cookies:
        return fail(
            "Unable to establish a session. Provide credentials in config.json or a valid cookies.json.",
            ErrorType.AUTHENTICATION,
            retryable=False,
        )

    endpoint = get_playwright_url(load_config())

    # Either of these elements being present marks a populated stock page.
    stock_content_selector = 'span.sdps-title-3.sc-sdps-solo-layout:not(.sdps-font-bold), #morningstar-section'
    invalid_ticker_msg = f"Invalid ticker: {ticker}. This appears not to be a valid stock ticker."

    context = None
    page = None
    pw, browser = await connect(endpoint)
    try:
        context = await new_context(browser, cookies=session_cookies)
        page = await new_page(context)

        # Open the stock research page through the auth-aware helper.
        if debug:
            nav_timeout = 30000
        else:
            nav_timeout = 45000
        navigated = await goto_with_auth_check(
            page,
            context,
            f"https://client.schwab.com/app/research/#/stocks/{ticker}",
            debug=debug,
            timeout=nav_timeout,
        )
        if not navigated:
            return fail(
                "Authentication failed while navigating to research page",
                ErrorType.AUTHENTICATION,
                retryable=True,
            )

        if debug:
            logger.debug(f"Current page URL: {page.url}")

        # First validation step: stock-specific content must become visible.
        try:
            await page.wait_for_selector(
                stock_content_selector,
                timeout=10000,
                state='visible'
            )
        except Exception as wait_err:
            if debug:
                logger.debug(f"Timeout waiting for stock content: {wait_err}")
            return fail(invalid_ticker_msg, ErrorType.VALIDATION, retryable=False)

        # Second validation step: inspect the DOM for a company name or the
        # Morningstar section.
        try:
            looks_valid = await page.evaluate('''
() => {
const nameSpan = document.querySelector('span.sdps-title-3.sc-sdps-solo-layout:not(.sdps-font-bold)');
if (nameSpan && nameSpan.textContent && nameSpan.textContent.trim().length > 2) {
return true;
}
const morningstarSection = document.querySelector('#morningstar-section');
if (morningstarSection) {
return true;
}
return false;
}
''')
            if not looks_valid:
                return fail(invalid_ticker_msg, ErrorType.VALIDATION, retryable=False)
        except Exception as e:
            logger.debug(f"Error checking for valid content: {e}")
            return fail(
                f"Invalid ticker: {ticker}. Unable to validate ticker.",
                ErrorType.VALIDATION,
                retryable=False,
            )

        # DOM scraping is the approach in use; the API approach failed due
        # to CORS restrictions.
        payload = await extract_phase1_data(page, debug=debug)
        return ok(payload)
    finally:
        # Best-effort teardown, innermost resource first; a failure to
        # release one resource must not prevent releasing the others.
        teardown = (
            (page, "close"),
            (context, "close"),
            (browser, "close"),
            (pw, "stop"),
        )
        for resource, method_name in teardown:
            try:
                if resource is not None:
                    await getattr(resource, method_name)()
            except Exception:
                pass
async def get_morningstar_data(ticker: str, debug: bool = False) -> Envelope[MorningstarData]:
    """Collect Morningstar report and dividend data for a ticker.

    Opens the Schwab stock research page for ``ticker``, checks that the page
    shows stock content, scrapes the company name and dividend fields, then
    locates the Morningstar equity report PDF, caches it and parses it.  When
    the download yields nothing, a previously cached PDF is parsed instead.

    Args:
        ticker: Stock ticker symbol (upper-cased before use).
        debug: Enable debug logging; also selects the shorter navigation timeout.

    Returns:
        ``ok(MorningstarData)`` on success; otherwise a failure envelope typed
        AUTHENTICATION (no session, or auth failure during navigation),
        VALIDATION (no stock content on the page) or PARSING (the report
        could not be downloaded or parsed and no usable cache exists).
    """
    ticker = ticker.upper()
    ensure_cache_dir()
    logger = logging.getLogger(__name__)
    if debug:
        logger.setLevel(logging.DEBUG)
        logger.debug(f"Starting get_morningstar_data for {ticker}")

    # Session management
    cookies = await ensure_cookies()
    if not cookies:
        return fail(
            "Unable to establish a session. Provide credentials in config.json or a valid cookies.json.",
            ErrorType.AUTHENTICATION,
            retryable=False,
        )
    config = load_config()
    playwright_url = get_playwright_url(config)

    # Browser orchestration
    context = None
    page = None
    p, browser = await connect(playwright_url)
    try:
        context = await new_context(browser, cookies=cookies)
        page = await new_page(context)

        # Use shared auth-aware navigation helper for consistency
        # Use shorter timeout for tests to speed up execution
        timeout = 30000 if debug else 45000
        success = await goto_with_auth_check(
            page,
            context,
            f"https://client.schwab.com/app/research/#/stocks/{ticker}",
            debug=debug,
            timeout=timeout,
        )
        if not success:
            return fail(
                "Authentication failed while navigating to research page",
                ErrorType.AUTHENTICATION,
                retryable=True,
            )

        # Validate ticker by checking for stock page content
        # Schwab doesn't redirect on invalid tickers, but the page content is empty/invalid
        if debug:
            logger.debug(f"Current page URL: {page.url}")

        # Wait for page content to load - Schwab's research page loads asynchronously
        # Give it time to populate the DOM before validation
        try:
            # Wait for either company name or Morningstar section to appear
            # This indicates the page has loaded stock-specific content
            await page.wait_for_selector(
                'span.sdps-title-3.sc-sdps-solo-layout:not(.sdps-font-bold), #morningstar-section',
                timeout=10000,
                state='visible'
            )
        except Exception as wait_err:
            # If neither selector appears after 10 seconds, likely an invalid ticker
            if debug:
                logger.debug(f"Timeout waiting for stock content: {wait_err}")
            return fail(
                f"Invalid ticker: {ticker}. This appears not to be a valid stock ticker.",
                ErrorType.VALIDATION,
                retryable=False,
            )

        # Additional validation: check if we have valid stock page content
        try:
            has_valid_content = await page.evaluate('''
() => {
// Look for company name span (valid stock pages have this)
const nameSpan = document.querySelector('span.sdps-title-3.sc-sdps-solo-layout:not(.sdps-font-bold)');
if (nameSpan && nameSpan.textContent && nameSpan.textContent.trim().length > 2) {
return true;
}
// Look for Morningstar section (valid stock pages have this)
const morningstarSection = document.querySelector('#morningstar-section');
if (morningstarSection) {
return true;
}
// Look for company profile description (valid stock pages have this)
const profileText = document.querySelector('p.sdps-text-body.sc-sdps-solo-layout');
if (profileText && profileText.textContent && profileText.textContent.trim().length > 50) {
return true;
}
// Look for any stock-related content
const stockContent = document.querySelector('#stock-details, #quote, [data-testid="stock-quote"]');
if (stockContent) {
return true;
}
return false;
}
''')
            if debug:
                logger.debug(f"Valid stock content detected: {has_valid_content}")
            if not has_valid_content:
                if debug:
                    logger.debug("Invalid ticker detected - no stock content found")
                return fail(
                    f"Invalid ticker: {ticker}. This appears not to be a valid stock ticker.",
                    ErrorType.VALIDATION,
                    retryable=False,
                )
        except Exception as e:
            logger.debug(f"Error checking for valid content: {e}")
            # If we can't check, assume invalid and return error
            return fail(
                f"Invalid ticker: {ticker}. Unable to validate ticker.",
                ErrorType.VALIDATION,
                retryable=False,
            )

        # Company name - extract from page elements (best effort; stays None on failure)
        company_name = None
        try:
            # Strategy 1: Extract from company name span element
            company_name = await page.evaluate('''
() => {
// Look for company name in title span
const nameSpan = document.querySelector('span.sdps-title-3.sc-sdps-solo-layout:not(.sdps-font-bold)');
if (nameSpan && nameSpan.textContent && nameSpan.textContent.trim().length > 2) {
return nameSpan.textContent.trim();
}
// Fallback: Extract from company profile description
const profileText = document.querySelector('p.sdps-text-body.sc-sdps-solo-layout');
if (profileText && profileText.textContent) {
const text = profileText.textContent.trim();
// Extract company name before " designs" or " is" or " provides"
const match = text.match(/^([A-Za-z0-9\\s&\\.,'-]+?)(?:\\s+(?:designs|is|provides|manufactures|operates|offers|engages))/i);
if (match) {
return match[1].trim();
}
}
return null;
}
''')
            if debug and company_name:
                logger.debug(f"Extracted company name: {company_name}")
        except Exception as e:
            logger.debug(f"Company name extraction error: {e}")

        # Morningstar section wait (non-fatal: later steps run regardless)
        try:
            await page.wait_for_selector('#morningstar-section', timeout=30000)
        except Exception:
            logger.debug("#morningstar-section not found within timeout")

        # Dividends (best effort). ``or {}`` keeps the .get() calls below safe
        # should the scraper hand back None instead of a dict.
        try:
            dividend_data = await extract_dividend_data(page, debug=debug) or {}
        except Exception as exc:
            logger.debug(f"Dividend extraction error: {exc}")
            dividend_data = {}

        # Find report and download/cache
        report_url, report_date = await find_report(page, debug=debug)
        # Normalise the date once so the value stored in the result and the
        # value used for the cache filename agree; a padded date would
        # otherwise fail strptime and yield a malformed filename stamp.
        report_date = report_date.strip() if report_date else None

        data: Dict[str, Any] = {}
        if report_date:
            data["Morningstar Equity Report Date"] = report_date
        if report_url:
            # Only store actual URL, not the __CLICK_TO_OPEN__ marker
            if report_url != '__CLICK_TO_OPEN__':
                data["Morningstar Equity Report URL"] = report_url
            pdf_bytes = await download_report_as_bytes(page, report_url, debug=debug)
        else:
            pdf_bytes = None

        parsed_data: Dict[str, Any] = {}
        if pdf_bytes:
            # Cache stamp: the report date as MM-DD-YYYY when it parses,
            # otherwise the raw date with spaces replaced, otherwise today.
            if report_date:
                from datetime import datetime
                try:
                    dt = datetime.strptime(report_date, "%b %d, %Y")
                    formatted_date = dt.strftime("%m-%d-%Y")
                except Exception:
                    formatted_date = report_date.replace(" ", "-")
            else:
                formatted_date = time.strftime("%m-%d-%Y")
            write_cached_pdf(ticker, formatted_date, pdf_bytes)
            try:
                parsed_data = parse_pdf(pdf_bytes)
                parsed_data["source"] = "live"
            except Exception as exc:
                logger.debug(f"PDF parsing failed: {exc}")
                parsed_data = {"error": "Failed to parse Morningstar report"}
        else:
            # No live download: fall back to whatever is cached for this ticker.
            cached = read_cached_pdf(ticker)
            if cached:
                try:
                    parsed_data = parse_pdf(cached)
                    parsed_data["source"] = "cache"
                except Exception as exc:
                    logger.debug(f"Cached PDF parsing failed: {exc}")
                    parsed_data = {"error": "Failed to parse cached Morningstar report"}
            else:
                parsed_data = {"error": f"Failed to download and no cache available for {ticker}"}

        # Report a download/parse failure before building a result object
        # that would be discarded anyway.
        if parsed_data.get("error"):
            return fail(parsed_data["error"], ErrorType.PARSING, retryable=True)

        morningstar = MorningstarData(
            ticker=ticker,
            company_name=company_name,
            previous_dividend_payment=dividend_data.get("Previous Dividend Payment"),
            previous_pay_date=dividend_data.get("Previous Pay Date"),
            previous_ex_date=dividend_data.get("Previous Ex-Dividend Date"),
            frequency=dividend_data.get("Frequency"),
            annual_dividend_rate=dividend_data.get("Annual Dividend Rate"),
            annual_dividend_yield=dividend_data.get("Annual Dividend Yield"),
            fair_value=parsed_data.get("Fair Value"),
            economic_moat=parsed_data.get("Economic Moat"),
            capital_allocation=parsed_data.get("Capital Allocation"),
            rating=_safe_int(parsed_data.get("Morningstar Rating")),
            one_star_price=parsed_data.get("1-Star Price"),
            five_star_price=parsed_data.get("5-Star Price"),
            assessment=parsed_data.get("Assessment"),
            range_52_week=parsed_data.get("52-Week Range"),
            dividend_yield=parsed_data.get("Dividend Yield"),
            investment_style=parsed_data.get("Investment Style"),
            report_url=data.get("Morningstar Equity Report URL"),
            report_date=data.get("Morningstar Equity Report Date"),
            source=parsed_data.get("source"),
        )
        return ok(morningstar)
    finally:
        # Best-effort teardown, innermost resource first. A failure to release
        # one resource must not prevent releasing the others, so errors are
        # logged at debug level rather than raised.
        for label, resource, method_name in (
            ("page", page, "close"),
            ("context", context, "close"),
            ("browser", browser, "close"),
            ("playwright", p, "stop"),
        ):
            try:
                if resource is not None:
                    await getattr(resource, method_name)()
            except Exception as cleanup_err:
                logger.debug(f"Ignoring error while releasing {label}: {cleanup_err}")
def _safe_int(value: Any) -> Optional[int]:
if value is None:
return None
try:
return int(str(value).strip())
except (TypeError, ValueError):
return None