All checks were successful
Build and Push Docker Image / build (push) Successful in 34s
453 lines
18 KiB
Python
453 lines
18 KiB
Python
import time
|
|
from typing import Any, Dict, Optional
|
|
import logging
|
|
from ...core.config import load_config, get_playwright_url
|
|
from ...browser.auth import ensure_cookies
|
|
from ...browser.client import connect, new_context, new_page
|
|
from ...browser.navigation import goto_with_auth_check
|
|
from ...core import Envelope, ErrorType, MorningstarData, EquityPhase1Data, fail, ok
|
|
from .morningstar import find_report, download_report_as_bytes
|
|
from ...storage.cache import ensure_cache_dir, cache_filename, read_cached_pdf, write_cached_pdf
|
|
from .parser import parse as parse_pdf
|
|
from .scraper import extract_dividend_data
|
|
from .phase1_scraper import extract_phase1_data # DOM scraping - the working approach
|
|
import re
|
|
|
|
def extract_company_name_from_title(page_title: str, ticker: str) -> Optional[str]:
    """Derive a company name from a Schwab research page title.

    Site branding (" | Charles Schwab", " - Charles Schwab") and the page-type
    phrases "Stock Quote & Summary" and "Stock Research" are removed first.
    A bare "Research" is removed only when it is a standalone label, i.e. it
    sits at the start/end of the title or next to a ``-``, ``|`` or ``:``
    separator. It is left alone inside a name so that titles such as
    "Lam Research Corp (LRCX)" keep the full company name.

    Two strategies are then tried in order:

    1. ``<name> (<TICKER>)`` at the start of the title (case-insensitive).
    2. The text before the first " |" or " -" separator, provided it is not
       just the ticker itself.

    Args:
        page_title: Raw page title text.
        ticker: Ticker symbol; compared case-insensitively.

    Returns:
        The extracted name, or ``None`` when either argument is not a string,
        the title is empty, or no candidate longer than one character is found.
    """
    # Non-string input used to surface as an exception swallowed by a blanket
    # ``except``; make that contract explicit instead.
    if not isinstance(page_title, str) or not isinstance(ticker, str):
        return None
    if not page_title:
        return None

    symbol = ticker.upper()

    title = page_title
    for boilerplate in (
        " | Charles Schwab",
        " - Charles Schwab",
        "Stock Quote & Summary",
        "Stock Research",
    ):
        title = title.replace(boilerplate, "")

    # Drop "Research" only as a standalone label (bounded by a separator or a
    # string end on both sides), never as a word inside the company name.
    title = re.sub(r"(?:^|(?<=[-|:]))\s*Research\s*(?=$|[-|:])", "", title)

    # Strategy 1: "<Company Name> (TICKER)".
    match = re.match(rf"^(.+?)\s*\({re.escape(symbol)}\)", title, re.IGNORECASE)
    if match:
        # Remove any " -" separator left hanging in front of the ticker token.
        name = match.group(1).strip().replace(" -", "").strip()
        if len(name) > 1 and not name.isdigit():
            return name

    # Strategy 2: leading segment before the first separator.
    for separator in (" |", " -"):
        if separator in title:
            candidate = title.split(separator)[0].strip()
            if candidate.upper() != symbol and len(candidate) > 1:
                return candidate

    return None
|
|
|
|
|
|
async def get_equity_phase1_data(ticker: str, debug: bool = False) -> Envelope[EquityPhase1Data]:
    """Fetch the Phase 1 equity data set for a ticker from the Schwab research page.

    Per the scraper this wraps, Phase 1 covers:
    - Quote/Price Data (symbol bar)
    - Enhanced Dividend Information (forward-looking dates)
    - Core Earnings Metrics (EPS, forecasts)
    - Basic Valuation Ratios (P/E, Forward P/E, PEG)
    - Calculated Metrics (payout ratio)

    Flow: obtain session cookies, connect to the browser, open the research
    page for the ticker, confirm the page shows stock content, then hand the
    page to ``extract_phase1_data``. Browser resources are always released.

    Args:
        ticker: Stock ticker symbol (upper-cased before use).
        debug: Enable debug logging; also selects the shorter navigation timeout.

    Returns:
        ``ok(...)`` around the scraped data, or a ``fail(...)`` envelope with
        ``ErrorType.AUTHENTICATION`` (no session / auth check failed) or
        ``ErrorType.VALIDATION`` (page does not look like a stock page).
    """
    ticker = ticker.upper()
    log = logging.getLogger(__name__)
    if debug:
        log.setLevel(logging.DEBUG)
        log.debug(f"Starting get_equity_phase1_data for {ticker}")

    # Session management: nothing below works without cookies.
    session_cookies = await ensure_cookies()
    if not session_cookies:
        return fail(
            "Unable to establish a session. Provide credentials in config.json or a valid cookies.json.",
            ErrorType.AUTHENTICATION,
            retryable=False,
        )

    endpoint = get_playwright_url(load_config())

    # Browser orchestration: context/page start as None so cleanup can tell
    # what was actually created.
    browser_context = None
    research_page = None
    playwright, browser = await connect(endpoint)
    try:
        browser_context = await new_context(browser, cookies=session_cookies)
        research_page = await new_page(browser_context)

        # Navigate to the stock research page.
        nav_timeout = 45000 if not debug else 30000
        research_url = f"https://client.schwab.com/app/research/#/stocks/{ticker}"
        authenticated = await goto_with_auth_check(
            research_page,
            browser_context,
            research_url,
            debug=debug,
            timeout=nav_timeout,
        )
        if not authenticated:
            return fail(
                "Authentication failed while navigating to research page",
                ErrorType.AUTHENTICATION,
                retryable=True,
            )

        if debug:
            log.debug(f"Current page URL: {research_page.url}")

        # Ticker validation, step 1: stock-specific content must become visible.
        stock_content_selector = (
            'span.sdps-title-3.sc-sdps-solo-layout:not(.sdps-font-bold), #morningstar-section'
        )
        try:
            await research_page.wait_for_selector(
                stock_content_selector,
                timeout=10000,
                state='visible',
            )
        except Exception as wait_err:
            if debug:
                log.debug(f"Timeout waiting for stock content: {wait_err}")
            return fail(
                f"Invalid ticker: {ticker}. This appears not to be a valid stock ticker.",
                ErrorType.VALIDATION,
                retryable=False,
            )

        # Ticker validation, step 2: inspect the DOM for a company name or the
        # Morningstar section.
        try:
            looks_like_stock_page = await research_page.evaluate('''
                () => {
                    const nameSpan = document.querySelector('span.sdps-title-3.sc-sdps-solo-layout:not(.sdps-font-bold)');
                    if (nameSpan && nameSpan.textContent && nameSpan.textContent.trim().length > 2) {
                        return true;
                    }
                    const morningstarSection = document.querySelector('#morningstar-section');
                    if (morningstarSection) {
                        return true;
                    }
                    return false;
                }
            ''')

            if not looks_like_stock_page:
                return fail(
                    f"Invalid ticker: {ticker}. This appears not to be a valid stock ticker.",
                    ErrorType.VALIDATION,
                    retryable=False,
                )
        except Exception as e:
            log.debug(f"Error checking for valid content: {e}")
            return fail(
                f"Invalid ticker: {ticker}. Unable to validate ticker.",
                ErrorType.VALIDATION,
                retryable=False,
            )

        # Extract Phase 1 data via DOM scraping.
        # Note: API approach failed due to CORS restrictions
        scraped = await extract_phase1_data(research_page, debug=debug)
        return ok(scraped)

    finally:
        # Best-effort teardown in creation-reverse order; a failure closing one
        # resource must not prevent closing the next.
        teardown_steps = (
            (research_page, "close"),
            (browser_context, "close"),
            (browser, "close"),
            (playwright, "stop"),
        )
        for resource, method_name in teardown_steps:
            try:
                if resource is not None:
                    await getattr(resource, method_name)()
            except Exception:
                pass
|
|
|
|
|
|
def _cache_date_for_report(report_date: Optional[str]) -> str:
    """Return the date label passed to ``write_cached_pdf`` for a report.

    A date in ``"%b %d, %Y"`` form (e.g. "Jan 15, 2024") becomes ``MM-DD-YYYY``.
    Text in any other form is kept, with spaces turned into dashes. A missing
    or blank date falls back to today's date.
    """
    # Imported locally so this helper does not depend on a file-level import.
    from datetime import datetime

    cleaned = (report_date or "").strip()
    if not cleaned:
        return time.strftime("%m-%d-%Y")
    try:
        return datetime.strptime(cleaned, "%b %d, %Y").strftime("%m-%d-%Y")
    except (TypeError, ValueError):
        return cleaned.replace(" ", "-")


async def get_morningstar_data(ticker: str, debug: bool = False) -> Envelope[MorningstarData]:
    """Collect Morningstar report data and dividend data for a ticker.

    Flow: obtain session cookies, connect to the browser, open the Schwab
    research page, confirm it shows stock content, read the company name and
    dividend data from the page, then locate and download the Morningstar
    report PDF. A downloaded PDF is written to the cache and parsed; when no
    PDF could be downloaded the cached copy (if any) is parsed instead.
    Browser resources are always released.

    Args:
        ticker: Stock ticker symbol (upper-cased before use).
        debug: Enable debug logging; also selects the shorter navigation timeout.

    Returns:
        ``ok(MorningstarData)`` on success, otherwise a ``fail(...)`` envelope:
        ``ErrorType.AUTHENTICATION`` when no session is available or the auth
        check fails, ``ErrorType.VALIDATION`` when the page does not look like
        a stock page, ``ErrorType.PARSING`` when the report could not be
        downloaded/parsed and no usable cache exists.
    """
    ticker = ticker.upper()
    ensure_cache_dir()
    logger = logging.getLogger(__name__)
    if debug:
        logger.setLevel(logging.DEBUG)
        logger.debug(f"Starting get_morningstar_data for {ticker}")

    # Session management
    cookies = await ensure_cookies()
    if not cookies:
        return fail(
            "Unable to establish a session. Provide credentials in config.json or a valid cookies.json.",
            ErrorType.AUTHENTICATION,
            retryable=False,
        )

    config = load_config()
    playwright_url = get_playwright_url(config)

    # Browser orchestration: context/page start as None so cleanup can tell
    # what was actually created.
    context = None
    page = None
    p, browser = await connect(playwright_url)
    try:
        context = await new_context(browser, cookies=cookies)
        page = await new_page(context)

        # Use shared auth-aware navigation helper for consistency.
        # Use shorter timeout for tests to speed up execution.
        timeout = 30000 if debug else 45000
        success = await goto_with_auth_check(
            page,
            context,
            f"https://client.schwab.com/app/research/#/stocks/{ticker}",
            debug=debug,
            timeout=timeout,
        )
        if not success:
            return fail(
                "Authentication failed while navigating to research page",
                ErrorType.AUTHENTICATION,
                retryable=True,
            )

        # Validate ticker by checking for stock page content.
        # Schwab doesn't redirect on invalid tickers, but the page content is empty/invalid.
        if debug:
            logger.debug(f"Current page URL: {page.url}")

        # The research page loads asynchronously: wait for either the company
        # name or the Morningstar section before validating.
        try:
            await page.wait_for_selector(
                'span.sdps-title-3.sc-sdps-solo-layout:not(.sdps-font-bold), #morningstar-section',
                timeout=10000,
                state='visible'
            )
        except Exception as wait_err:
            # Neither selector appeared within 10 seconds: likely an invalid ticker.
            if debug:
                logger.debug(f"Timeout waiting for stock content: {wait_err}")
            return fail(
                f"Invalid ticker: {ticker}. This appears not to be a valid stock ticker.",
                ErrorType.VALIDATION,
                retryable=False,
            )

        # Additional validation: check if we have valid stock page content.
        try:
            has_valid_content = await page.evaluate('''
                () => {
                    // Look for company name span (valid stock pages have this)
                    const nameSpan = document.querySelector('span.sdps-title-3.sc-sdps-solo-layout:not(.sdps-font-bold)');
                    if (nameSpan && nameSpan.textContent && nameSpan.textContent.trim().length > 2) {
                        return true;
                    }

                    // Look for Morningstar section (valid stock pages have this)
                    const morningstarSection = document.querySelector('#morningstar-section');
                    if (morningstarSection) {
                        return true;
                    }

                    // Look for company profile description (valid stock pages have this)
                    const profileText = document.querySelector('p.sdps-text-body.sc-sdps-solo-layout');
                    if (profileText && profileText.textContent && profileText.textContent.trim().length > 50) {
                        return true;
                    }

                    // Look for any stock-related content
                    const stockContent = document.querySelector('#stock-details, #quote, [data-testid="stock-quote"]');
                    if (stockContent) {
                        return true;
                    }

                    return false;
                }
            ''')

            if debug:
                logger.debug(f"Valid stock content detected: {has_valid_content}")

            if not has_valid_content:
                if debug:
                    logger.debug("Invalid ticker detected - no stock content found")
                return fail(
                    f"Invalid ticker: {ticker}. This appears not to be a valid stock ticker.",
                    ErrorType.VALIDATION,
                    retryable=False,
                )
        except Exception as e:
            logger.debug(f"Error checking for valid content: {e}")
            # If we can't check, assume invalid and return error.
            return fail(
                f"Invalid ticker: {ticker}. Unable to validate ticker.",
                ErrorType.VALIDATION,
                retryable=False,
            )

        # Company name - extract from page elements (best effort; stays None on failure).
        company_name = None
        try:
            company_name = await page.evaluate('''
                () => {
                    // Look for company name in title span
                    const nameSpan = document.querySelector('span.sdps-title-3.sc-sdps-solo-layout:not(.sdps-font-bold)');
                    if (nameSpan && nameSpan.textContent && nameSpan.textContent.trim().length > 2) {
                        return nameSpan.textContent.trim();
                    }

                    // Fallback: Extract from company profile description
                    const profileText = document.querySelector('p.sdps-text-body.sc-sdps-solo-layout');
                    if (profileText && profileText.textContent) {
                        const text = profileText.textContent.trim();
                        // Extract company name before " designs" or " is" or " provides"
                        const match = text.match(/^([A-Za-z0-9\\s&\\.,'-]+?)(?:\\s+(?:designs|is|provides|manufactures|operates|offers|engages))/i);
                        if (match) {
                            return match[1].trim();
                        }
                    }

                    return null;
                }
            ''')
            if debug and company_name:
                logger.debug(f"Extracted company name: {company_name}")
        except Exception as e:
            logger.debug(f"Company name extraction error: {e}")

        # Morningstar section wait (non-fatal: later steps run regardless).
        try:
            await page.wait_for_selector('#morningstar-section', timeout=30000)
        except Exception:
            logger.debug("#morningstar-section not found within timeout")

        # Dividends (best effort). ``or {}`` keeps the ``.get`` calls below safe
        # should the scraper hand back None instead of a mapping.
        try:
            dividend_data = await extract_dividend_data(page, debug=debug) or {}
        except Exception as exc:
            logger.debug(f"Dividend extraction error: {exc}")
            dividend_data = {}

        # Find report and download/cache.
        report_url, report_date = await find_report(page, debug=debug)
        # Normalise once so the stored date and the cache label are derived
        # from the same text (unstripped text makes strptime fail).
        report_date = report_date.strip() if report_date else None

        data: Dict[str, Any] = {}
        if report_date:
            data["Morningstar Equity Report Date"] = report_date
        if report_url:
            # Only store actual URL, not the __CLICK_TO_OPEN__ marker; the
            # download helper is still called with whichever value was found.
            if report_url != '__CLICK_TO_OPEN__':
                data["Morningstar Equity Report URL"] = report_url
            pdf_bytes = await download_report_as_bytes(page, report_url, debug=debug)
        else:
            pdf_bytes = None

        parsed_data: Dict[str, Any] = {}
        if pdf_bytes:
            write_cached_pdf(ticker, _cache_date_for_report(report_date), pdf_bytes)
            try:
                parsed_data = parse_pdf(pdf_bytes)
                parsed_data["source"] = "live"
            except Exception as exc:
                logger.debug(f"PDF parsing failed: {exc}")
                parsed_data = {"error": "Failed to parse Morningstar report"}
        else:
            cached = read_cached_pdf(ticker)
            if cached:
                try:
                    parsed_data = parse_pdf(cached)
                    parsed_data["source"] = "cache"
                except Exception as exc:
                    logger.debug(f"Cached PDF parsing failed: {exc}")
                    parsed_data = {"error": "Failed to parse cached Morningstar report"}
            else:
                parsed_data = {"error": f"Failed to download and no cache available for {ticker}"}

        # Report the failure before building a result object that would be discarded.
        if parsed_data.get("error"):
            return fail(parsed_data["error"], ErrorType.PARSING, retryable=True)

        morningstar = MorningstarData(
            ticker=ticker,
            company_name=company_name,
            previous_dividend_payment=dividend_data.get("Previous Dividend Payment"),
            previous_pay_date=dividend_data.get("Previous Pay Date"),
            previous_ex_date=dividend_data.get("Previous Ex-Dividend Date"),
            frequency=dividend_data.get("Frequency"),
            annual_dividend_rate=dividend_data.get("Annual Dividend Rate"),
            annual_dividend_yield=dividend_data.get("Annual Dividend Yield"),
            fair_value=parsed_data.get("Fair Value"),
            economic_moat=parsed_data.get("Economic Moat"),
            capital_allocation=parsed_data.get("Capital Allocation"),
            rating=_safe_int(parsed_data.get("Morningstar Rating")),
            one_star_price=parsed_data.get("1-Star Price"),
            five_star_price=parsed_data.get("5-Star Price"),
            assessment=parsed_data.get("Assessment"),
            range_52_week=parsed_data.get("52-Week Range"),
            dividend_yield=parsed_data.get("Dividend Yield"),
            investment_style=parsed_data.get("Investment Style"),
            report_url=data.get("Morningstar Equity Report URL"),
            report_date=data.get("Morningstar Equity Report Date"),
            source=parsed_data.get("source"),
        )

        return ok(morningstar)

    finally:
        # Best-effort teardown, one resource at a time, so a failure closing
        # one handle does not prevent closing the next.
        for resource, method_name in (
            (page, "close"),
            (context, "close"),
            (browser, "close"),
            (p, "stop"),
        ):
            try:
                if resource is not None:
                    await getattr(resource, method_name)()
            except Exception:
                pass
|
|
|
|
|
|
def _safe_int(value: Any) -> Optional[int]:
|
|
if value is None:
|
|
return None
|
|
try:
|
|
return int(str(value).strip())
|
|
except (TypeError, ValueError):
|
|
return None
|