Files
schwab-mcp-custom/schwab_scraper/features/equity/phase1_scraper.py
b3nw 650ea2d087
All checks were successful
Build and Push Docker Image / build (push) Successful in 34s
Fix build: Bundle schwab_scraper source and use local dependencies
2026-04-24 01:50:20 +00:00

787 lines
35 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
"""Phase 1: Essential Dividend Metrics Implementation (DEPRECATED)
⚠️ DEPRECATED: This DOM-scraping based approach has been replaced by phase1_api_scraper.py
which uses Schwab's REST APIs directly. The API approach is more reliable, complete,
and maintainable than DOM scraping.
This module is kept for reference only. New code should use phase1_api_scraper.py.
Old approach extracts from DOM:
- Quote/Price Data (symbol bar)
- Enhanced Dividend Information (forward-looking dates)
- Core Earnings Metrics (EPS, forecasts)
- Basic Valuation Ratios (P/E, Forward P/E, PEG)
- Calculated Metrics (payout ratio)
"""
from typing import Dict, Any, Optional
import re
import logging
from ...core import QuoteData, EnhancedDividends, EarningsData, CalculatedMetrics, EquityPhase1Data
logger = logging.getLogger(__name__)
def _parse_float(value: Any) -> Optional[float]:
"""Safely parse a value to float, handling $ and % symbols."""
if value is None:
return None
try:
# Remove common formatting characters
clean = str(value).strip().replace('$', '').replace(',', '').replace('%', '')
if clean and clean != '--' and clean.lower() != 'n/a':
return float(clean)
except (ValueError, AttributeError):
pass
return None
def _parse_int(value: Any) -> Optional[int]:
"""Safely parse a value to int."""
if value is None:
return None
try:
clean = str(value).strip().replace(',', '')
if clean and clean != '--' and clean.lower() != 'n/a':
return int(float(clean))
except (ValueError, AttributeError):
pass
return None
def _parse_volume(volume_str: str) -> Optional[int]:
"""Parse volume string like '8M', '22.4M', '1.2B' to integer."""
if not volume_str:
return None
try:
volume_str = volume_str.strip().upper()
multiplier = 1
if volume_str.endswith('K'):
multiplier = 1_000
volume_str = volume_str[:-1]
elif volume_str.endswith('M'):
multiplier = 1_000_000
volume_str = volume_str[:-1]
elif volume_str.endswith('B'):
multiplier = 1_000_000_000
volume_str = volume_str[:-1]
value = float(volume_str)
return int(value * multiplier)
except (ValueError, AttributeError):
return None
def _parse_revenue(revenue_str: str) -> Optional[float]:
"""Parse revenue string like '$92.15B', '$1.5M' to dollar value."""
if not revenue_str:
return None
try:
revenue_str = revenue_str.strip().upper().replace('$', '').replace(',', '')
multiplier = 1
if revenue_str.endswith('K'):
multiplier = 1_000
revenue_str = revenue_str[:-1]
elif revenue_str.endswith('M'):
multiplier = 1_000_000
revenue_str = revenue_str[:-1]
elif revenue_str.endswith('B'):
multiplier = 1_000_000_000
revenue_str = revenue_str[:-1]
elif revenue_str.endswith('T'):
multiplier = 1_000_000_000_000
revenue_str = revenue_str[:-1]
value = float(revenue_str)
return value * multiplier
except (ValueError, AttributeError):
return None
async def extract_quote_data(page, ticker: str = "", debug: bool = False) -> QuoteData:
    """Extract quote/price data from symbol bar.

    The symbol-bar text is read in the browser and matched against a series
    of regular expressions; the captured strings are converted to numbers on
    the Python side. Sector and exchange are then looked up in a second
    browser call.

    Args:
        page: Playwright page object
        ticker: Stock ticker symbol (for pattern matching)
        debug: Enable debug logging

    Returns:
        QuoteData object with extracted fields. Extraction is best-effort:
        if anything raises, the error is logged (when ``debug`` is set) and
        the object is returned with whatever was assigned before the failure.
    """
    quote = QuoteData()
    try:
        if debug:
            logger.debug("Starting quote data extraction...")

        # Wait for symbol bar content (look for key labels).
        # The alternatives are a single CSS selector list: the `text=`
        # engine cannot appear inside a comma-separated CSS list, so the
        # `:text()` pseudo-class is used for the label alternative.
        try:
            await page.wait_for_selector(
                '#app-symbol-bar-component, :text("Previous close")',
                state='attached',
                timeout=15000,
            )
        except Exception:
            if debug:
                logger.debug("Timeout waiting for symbol bar selector, attempting to parse whatever is there")

        # The raw symbol-bar text is only used for the debug log, so the
        # extra browser round-trip is skipped unless debugging.
        if debug:
            symbol_bar_text = await page.evaluate('''
() => {
const symbolBar = document.querySelector('#app-symbol-bar-component');
if (symbolBar && symbolBar.textContent && symbolBar.textContent.includes('Previous close')) return symbolBar.textContent;
// If specific component not found, try to find the container with market data
// Look for container with "Previous close"
const labels = Array.from(document.querySelectorAll('span, div, p'));
const prevCloseLabel = labels.find(el => el.textContent && el.textContent.includes('Previous close'));
if (prevCloseLabel) {
// Return the parent's text content (go up a few levels to capture all data)
let parent = prevCloseLabel.parentElement;
let count = 0;
while (parent && count < 8) {
if (parent.textContent.length > 300) return parent.textContent;
parent = parent.parentElement;
count++;
}
}
return document.body.textContent || '';
}
''')
            logger.debug(f"Symbol bar text (first 500 chars): {symbol_bar_text[:500]}")

        # Extract structured data.
        # NOTE: this is a *raw* Python string, so the JavaScript sees every
        # backslash exactly as written here. Regex literals use a single
        # backslash (\s, \$); patterns built from JS string literals use two.
        quote_data = await page.evaluate(r'''
(ticker) => {
const data = {};
// Helper to get text content from page
const getText = () => {
const symbolBar = document.querySelector('#app-symbol-bar-component');
// Verify it looks like the right component by checking for "Previous close"
if (symbolBar && symbolBar.textContent && symbolBar.textContent.includes('Previous close')) {
return symbolBar.textContent;
}
// Fallback logic
const labels = Array.from(document.querySelectorAll('span, div, p'));
const prevCloseLabel = labels.find(el => el.textContent && el.textContent.includes('Previous close'));
if (prevCloseLabel) {
let parent = prevCloseLabel.parentElement;
let count = 0;
while (parent && count < 8) {
if (parent.textContent.length > 300) return parent.textContent;
parent = parent.parentElement;
count++;
}
}
// Last resort: body text
return document.body.textContent || '';
};
const fullText = getText();
// Try to find price in quote container first for accuracy
const priceElement = document.querySelector('.symbol-quote-container, [data-testid="quote-price"]');
if (priceElement) {
const priceText = priceElement.textContent || '';
const priceMatch = priceText.match(/\$([0-9,]+\.[0-9]+)/);
if (priceMatch) data.price = priceMatch[1].replace(',', '');
} else {
// Fallback regex for price if element not found
// Look for price near top or just regex
const priceMatch = fullText.match(/\$([0-9,]+\.[0-9]{2})(\s|[+-]|$)/);
if (priceMatch) data.price = priceMatch[1].replace(',', '');
}
// After hours (using \s* for robustness)
const afterHoursMatch = fullText.match(/After hours:?\s*\$([0-9,.]+)/i);
if (afterHoursMatch) data.after_hours_price = afterHoursMatch[1].replace(',', '');
const afterHoursChangeMatch = fullText.match(/After hours:.*?([+-]\$[0-9,.]+)\s*\(([+-][0-9.]+)%\)/i);
if (afterHoursChangeMatch) {
data.after_hours_change = afterHoursChangeMatch[1].replace('$', '').replace(',', '');
data.after_hours_change_percent = afterHoursChangeMatch[2];
}
// Bid/Ask (using \s* for robustness)
const bidMatch = fullText.match(/Bid\s*\$([0-9,.]+)/i);
if (bidMatch) data.bid = bidMatch[1].replace(',', '');
const askMatch = fullText.match(/Ask\s*\$([0-9,.]+)/i);
if (askMatch) data.ask = askMatch[1].replace(',', '');
const bidAskSizeMatch = fullText.match(/Bid\/Ask Size\s*([0-9]+\/[0-9]+)/i);
if (bidAskSizeMatch) data.bid_ask_size = bidAskSizeMatch[1];
// Previous close and open (using \s* instead of \s+)
const prevCloseMatch = fullText.match(/Previous close\s*\$([0-9,.]+)/i);
if (prevCloseMatch) data.previous_close = prevCloseMatch[1].replace(',', '');
const openMatch = fullText.match(/Today's open\s*\$([0-9,.]+)/i);
if (openMatch) data.open = openMatch[1].replace(',', '');
// Volume (using \s*)
const volumeMatch = fullText.match(/Today's volume\s*([0-9.]+[KMB]?)/i);
if (volumeMatch) data.volume = volumeMatch[1];
const volumeVsAvgMatch = fullText.match(/Today's volume\s*[0-9.]+[KMB]?\s*(Above Avg\.|Below Avg\.|Average)/i);
if (volumeVsAvgMatch) data.volume_vs_avg = volumeVsAvgMatch[1];
// Day range
// Pattern: "Today's range low $200.81 Today's range high $203.45" or similar
// We'll look for "low $X" and "high $Y" appearing after "Today's range"
const dayRangeMatch = fullText.match(/Today's range.*?low\s*\$([0-9,.]+).*?high\s*\$([0-9,.]+)/i);
if (dayRangeMatch) {
data.day_range_low = dayRangeMatch[1].replace(',', '');
data.day_range_high = dayRangeMatch[2].replace(',', '');
}
// 52-week range
const weekRangeMatch = fullText.match(/52-week range.*?low\s*\$([0-9,.]+).*?high\s*\$([0-9,.]+)/i);
if (weekRangeMatch) {
data.week_52_low = weekRangeMatch[1].replace(',', '');
data.week_52_high = weekRangeMatch[2].replace(',', '');
}
// Market cap (may be in Share Profile section)
const marketCapMatch = fullText.match(/Market Cap\s*\$([0-9.]+[KMBT])/i);
if (marketCapMatch) data.market_cap = marketCapMatch[1];
// Change and change percent
// Try specific formatted pattern first: TICKER $PRICE CHANGE CHANGE%
// e.g. "JNJ $201.95 -1.03 -0.51%"
const standardPattern = fullText.match(/\$([0-9,.]+)\s*([+-]?[0-9,.]+)\s*([+-]?[0-9.]+)%/);
if (standardPattern) {
if (!data.price) data.price = standardPattern[1].replace(',', '');
data.change = standardPattern[2];
data.change_percent = standardPattern[3];
}
let percentMatch = null;
if (ticker && !data.change_percent) {
// Match: TICKER$digits.digits{2}percent%
// Escape regex metacharacters so the ticker is matched literally
const safeTicker = ticker.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
const tickerPattern = new RegExp(safeTicker + '\\.?[\\s]*\\$([0-9,]+\\.[0-9]{2})[\\s]*([0-9.]+)%', 'i');
percentMatch = fullText.match(tickerPattern);
if (percentMatch) {
data.change_percent = percentMatch[2];
}
}
if (!data.change_percent) {
// Fallback: match any price+percent pattern with space
const fallbackMatch = fullText.match(/\$[0-9,.]+\s*([+-]?[0-9.]+)%/);
if (fallbackMatch) {
data.change_percent = fallbackMatch[1];
}
}
// Pattern 2: "+$1.23 (+0.45%)" or "-$1.23 (-0.45%)"
let changeMatch = fullText.match(/([+-]\$[0-9,.]+)\s*\(([+-][0-9.]+)%\)/);
// Pattern 3: "$193.08 +1.23 +0.64%" (price followed by change)
if (!changeMatch) {
changeMatch = fullText.match(/\$[0-9,.]+\s*([+-][0-9,.]+)\s*([+-][0-9.]+)%/);
}
// Pattern 4: "Change: +1.23 (+0.64%)"
if (!changeMatch) {
changeMatch = fullText.match(/Change:?\s*([+-][0-9,.]+)\s*\(([+-][0-9.]+)%\)/i);
}
if (changeMatch) {
data.change = changeMatch[1].replace('$', '').replace(',', '');
if (!data.change_percent) {
data.change_percent = changeMatch[2].replace(/[+]/g, '');
}
}
// Exchange - look for NYSE, NASDAQ, etc.
const exchangeMatch = fullText.match(/\b(NYSE|NASDAQ|AMEX|OTC|BATS)\b/i);
if (exchangeMatch) data.exchange = exchangeMatch[1].toUpperCase();
return data;
}
''', ticker)

        # Parse and assign values. The JS side strips at most one comma per
        # value; _parse_float removes any that remain.
        quote.price = _parse_float(quote_data.get('price'))
        quote.change = _parse_float(quote_data.get('change'))
        quote.change_percent = _parse_float(quote_data.get('change_percent'))
        quote.after_hours_price = _parse_float(quote_data.get('after_hours_price'))
        quote.after_hours_change = _parse_float(quote_data.get('after_hours_change'))
        quote.after_hours_change_percent = _parse_float(quote_data.get('after_hours_change_percent'))
        quote.bid = _parse_float(quote_data.get('bid'))
        quote.ask = _parse_float(quote_data.get('ask'))
        quote.bid_ask_size = quote_data.get('bid_ask_size')
        quote.previous_close = _parse_float(quote_data.get('previous_close'))
        quote.open = _parse_float(quote_data.get('open'))
        quote.volume = _parse_volume(quote_data.get('volume', ''))
        quote.volume_vs_avg = quote_data.get('volume_vs_avg')
        quote.day_range_low = _parse_float(quote_data.get('day_range_low'))
        quote.day_range_high = _parse_float(quote_data.get('day_range_high'))
        quote.week_52_low = _parse_float(quote_data.get('week_52_low'))
        quote.week_52_high = _parse_float(quote_data.get('week_52_high'))
        # Market cap is kept as the abbreviated string (e.g. "486.2B").
        quote.market_cap = quote_data.get('market_cap')

        # Try to extract sector and exchange from page header
        header_data = await page.evaluate(r'''
() => {
const data = {};
// Look for sector near company name
const sectorElement = document.querySelector('[data-testid="sector"], .sector');
if (sectorElement) {
data.sector = sectorElement.textContent.replace('Sector', '').trim();
} else {
// Manual search for text containing "Sector"
const spans = Array.from(document.querySelectorAll('span'));
const sectorSpan = spans.find(el => el.textContent && el.textContent.includes('Sector'));
if (sectorSpan) {
data.sector = sectorSpan.textContent.replace('Sector', '').replace(':', '').trim();
}
}
// Look for exchange near ticker
const exchangeElement = document.querySelector('[data-testid="exchange"], .exchange');
if (exchangeElement) {
data.exchange = exchangeElement.textContent.trim();
}
// Fallback: parse from page text
const pageText = document.body.textContent || '';
if (!data.sector) {
const sectorMatch = pageText.match(/Sector[:\s]+([A-Za-z\s&]+)/);
if (sectorMatch) data.sector = sectorMatch[1].trim();
}
if (!data.exchange) {
const exchangeMatch = pageText.match(/(NYSE|NASDAQ|AMEX|OTC)/i);
if (exchangeMatch) data.exchange = exchangeMatch[1].toUpperCase();
}
return data;
}
''')
        quote.sector = header_data.get('sector')
        # Header lookup wins; the symbol-bar match (which also recognises
        # BATS) is used only when the header lookup found nothing.
        quote.exchange = header_data.get('exchange') or quote_data.get('exchange')

        if debug:
            logger.debug(f"Extracted quote data: price={quote.price}, volume={quote.volume}, "
                         f"52w_range={quote.week_52_low}-{quote.week_52_high}")
    except Exception as e:
        # Deliberate best-effort: a scrape failure must not abort the caller.
        if debug:
            logger.debug(f"Error extracting quote data: {e}")
    return quote
async def extract_enhanced_dividends(page, debug: bool = False) -> EnhancedDividends:
    """Scrape the ``#dividends`` panel into an EnhancedDividends record.

    The panel is awaited, scrolled into view and its header clicked before
    the panel text is matched against label-specific regular expressions in
    the browser.

    Args:
        page: Playwright page object
        debug: Enable debug logging

    Returns:
        EnhancedDividends object with extracted fields. If any step raises,
        the error is logged (when ``debug`` is set) and the object is
        returned with whatever had been assigned up to that point.
    """
    dividends = EnhancedDividends()
    try:
        if debug:
            logger.debug("Starting enhanced dividend extraction...")

        # The panel must exist before it can be scrolled to or clicked.
        await page.wait_for_selector('#dividends', timeout=15000)
        await page.evaluate('''
() => {
const dividendsPanel = document.querySelector('#dividends');
if (dividendsPanel) {
dividendsPanel.scrollIntoView({ behavior: 'smooth', block: 'center' });
}
}
''')
        await page.wait_for_timeout(1000)

        # The panel body is not populated until its header is clicked.
        if debug:
            logger.debug("Clicking dividends panel header to trigger content load...")
        try:
            toggle = await page.query_selector('#dividends h2, #dividends .sdps-panel__title, #dividends-togglechevron-button')
            if toggle:
                await toggle.click()
                await page.wait_for_timeout(2000)
                if debug:
                    logger.debug("Clicked dividends panel header successfully")
        except Exception as click_error:
            if debug:
                logger.debug(f"Could not click dividends header: {click_error}")

        # Extra settle time after the click attempt.
        await page.wait_for_timeout(1000)

        # Non-raw string: each doubled backslash below reaches the browser
        # as a single backslash inside the regex literals.
        raw = await page.evaluate('''
() => {
const data = {};
const dividendsPanel = document.querySelector('#dividends');
if (!dividendsPanel) return data;
const fullText = dividendsPanel.textContent || '';
// DEBUG: Return sample of text for debugging
data._debug_text_sample = fullText.substring(0, 800);
// Next dividend payment
const nextPaymentMatch = fullText.match(/Next Dividend Payment\\s*\\$([0-9.]+)/i);
if (nextPaymentMatch) data.next_payment = nextPaymentMatch[1];
// Next pay date
const nextPayDateMatch = fullText.match(/Next Pay Date\\s*([A-Za-z]+ [0-9]{1,2}, [0-9]{4})/i);
if (nextPayDateMatch) data.next_pay_date = nextPayDateMatch[1];
// Next ex-date
const nextExDateMatch = fullText.match(/Next Ex-Date\\s*([A-Za-z]+ [0-9]{1,2}, [0-9]{4})/i);
if (nextExDateMatch) data.next_ex_date = nextExDateMatch[1];
// Previous dividend payment
const prevPaymentMatch = fullText.match(/Previous Dividend Payment\\s*\\$([0-9.]+)/i);
if (prevPaymentMatch) data.previous_payment = prevPaymentMatch[1];
// Previous pay date
const prevPayDateMatch = fullText.match(/Previous Pay Date\\s*([A-Za-z]+ [0-9]{1,2}, [0-9]{4})/i);
if (prevPayDateMatch) data.previous_pay_date = prevPayDateMatch[1];
// Previous ex-date
const prevExDateMatch = fullText.match(/Previous Ex-Date\\s*([A-Za-z]+ [0-9]{1,2}, [0-9]{4})/i);
if (prevExDateMatch) data.previous_ex_date = prevExDateMatch[1];
// Frequency
const frequencyMatch = fullText.match(/Frequency\\s*(Quarterly|Monthly|Annual|Semi-Annual)/i);
if (frequencyMatch) data.frequency = frequencyMatch[1];
// Annual Dividend Rate (IAD)
const annualRateMatch = fullText.match(/Annual Dividend Rate.*?\\$([0-9.]+)/i);
if (annualRateMatch) data.annual_rate = annualRateMatch[1];
// Annual Dividend Yield - appears after "Annual Dividend Yield" text
// Text pattern: "Annual Dividend Yield...2.71%"
const yieldMatch = fullText.match(/Annual Dividend Yield[\\s\\S]{0,300}?([0-9]+\\.[0-9]+)%/i);
if (yieldMatch) data.annual_yield = yieldMatch[1];
return data;
}
''')

        text_sample = raw.get('_debug_text_sample')
        if debug and text_sample:
            logger.debug(f"Dividend panel text sample: {text_sample}")

        # (attribute, converter) pairs; None keeps the scraped string as-is.
        field_converters = (
            ('next_payment', _parse_float),
            ('next_pay_date', None),
            ('next_ex_date', None),
            ('previous_payment', _parse_float),
            ('previous_pay_date', None),
            ('previous_ex_date', None),
            ('frequency', None),
            ('annual_rate', _parse_float),
            ('annual_yield', _parse_float),
        )
        for field_name, converter in field_converters:
            scraped = raw.get(field_name)
            if converter is not None:
                scraped = converter(scraped)
            setattr(dividends, field_name, scraped)

        if debug:
            logger.debug(f"Extracted dividend data: next_payment={dividends.next_payment}, "
                         f"next_pay_date={dividends.next_pay_date}, annual_rate={dividends.annual_rate}")
    except Exception as e:
        if debug:
            logger.debug(f"Error extracting dividend data: {e}")
    return dividends
async def extract_earnings_data(page, debug: bool = False) -> EarningsData:
    """Extract earnings metrics and forecasts.

    The ``#expected-earnings`` panel is awaited, scrolled into view, its
    header clicked and any "Show More" control expanded. The panel text
    (including shadow-DOM content) is then matched against label-specific
    regular expressions in the browser and converted on the Python side.

    Args:
        page: Playwright page object
        debug: Enable debug logging

    Returns:
        EarningsData object with extracted fields. Extraction is best-effort:
        if anything raises, the error is logged (when ``debug`` is set) and
        the object is returned with whatever was assigned before the failure.
    """
    earnings = EarningsData()
    try:
        if debug:
            logger.debug("Starting earnings data extraction...")

        # Wait for earnings panel to load
        await page.wait_for_selector('#expected-earnings', timeout=15000)

        # Scroll to earnings panel
        await page.evaluate('''
() => {
const earningsPanel = document.querySelector('#expected-earnings');
if (earningsPanel) {
earningsPanel.scrollIntoView({ behavior: 'smooth', block: 'center' });
}
}
''')
        await page.wait_for_timeout(1000)

        # CRITICAL: Click on the panel header to trigger content loading
        # Schwab's panels don't auto-load - they need to be clicked
        if debug:
            logger.debug("Clicking earnings panel header to trigger content load...")
        try:
            earnings_header = await page.query_selector('#expected-earnings h2, #expected-earnings .sdps-panel__title, #expected-earnings-heading, #expected-earnings-togglechevron-button')
            if earnings_header:
                await earnings_header.click()
                await page.wait_for_timeout(2000)
                if debug:
                    logger.debug("Clicked earnings panel header successfully")
        except Exception as e:
            if debug:
                logger.debug(f"Could not click earnings header: {e}")

        # Wait for content to load after click
        await page.wait_for_timeout(1000)

        # Check for and click "Show More" if present
        try:
            # Use JS to find and click - most robust way
            clicked = await page.evaluate('''
() => {
const panel = document.querySelector('#expected-earnings');
if (!panel) return false;
// Find any element with "Show More" text
const elements = Array.from(panel.querySelectorAll('a, button, span, div'));
const showMore = elements.find(el => el.textContent.trim().toLowerCase() === "show more");
if (showMore) {
showMore.click();
return true;
}
return false;
}
''')
            if clicked:
                if debug:
                    logger.debug("found and clicked 'Show More' via JS")
                await page.wait_for_timeout(2000)
            elif debug:
                logger.debug("'Show More' not found or not clickable")
        except Exception as e:
            if debug:
                logger.debug(f"Error checking for Show More: {e}")

        # Extract earnings data.
        # NOTE: this is a *raw* Python string, so regex literals must use a
        # single backslash. A doubled `\\$` would mean "literal backslash,
        # then end-of-input" in JavaScript and could never match a dollar
        # amount, which is why the consensus / high-low patterns use `\$`.
        earnings_data = await page.evaluate(r'''
(debug) => {
const data = {};
// Helper to get text content including Shadow DOMs
const getDeepText = (root) => {
if (!root) return '';
if (root.nodeType === Node.TEXT_NODE) return root.textContent;
if (root.nodeType === Node.ELEMENT_NODE && root.shadowRoot) {
return getDeepText(root.shadowRoot);
}
let text = '';
const children = root.childNodes;
for (let i = 0; i < children.length; i++) {
text += getDeepText(children[i]);
}
return text;
};
const earningsPanel = document.querySelector('#expected-earnings');
let fullText = '';
if (earningsPanel) {
fullText = getDeepText(earningsPanel);
}
// Fallback to body deep text if panel seems empty
if (fullText.length < 500 || !fullText.includes("Announcement")) {
fullText = getDeepText(document.body);
}
// Next earnings announcement - robust regex checking for various patterns
let nextAnnouncementMatch = fullText.match(/Next Earnings Announcement.*?([0-9]{2}\/[0-9]{2}\/[0-9]{4})/i);
if (!nextAnnouncementMatch) {
// Try alternate pattern: Announcement: 12/12/2025
nextAnnouncementMatch = fullText.match(/Announcement:?\s*([0-9]{2}\/[0-9]{2}\/[0-9]{4})/i);
}
if (nextAnnouncementMatch) data.next_announcement_date = nextAnnouncementMatch[1];
// Announcement timing
const timingMatch = fullText.match(/(Before Market Open|After Market Close)/i);
if (timingMatch) data.announcement_timing = timingMatch[1];
// Number of analysts
const analystsMatch = fullText.match(/With ([0-9]+) analysts covering/i);
if (analystsMatch) data.analysts_covering = analystsMatch[1];
// Consensus estimate
const consensusMatch = fullText.match(/consensus.*?estimate is \$([0-9.]+)/i);
if (consensusMatch) data.consensus_estimate = consensusMatch[1];
// High/Low estimates
const highLowMatch = fullText.match(/high and low estimates are \$([0-9.]+) and \$([0-9.]+)/i);
if (highLowMatch) {
data.estimate_high = highLowMatch[1];
data.estimate_low = highLowMatch[2];
}
// EPS TTM (multiple patterns)
let epsMatch = fullText.match(/EPS\s*\(TTM\)\s*(?:Value)?\s*\$?([0-9.-]+)/i);
if (!epsMatch) epsMatch = fullText.match(/Earnings per Share\s*\(?TTM\)?\s*(?:Value)?\s*\$?([0-9.-]+)/i);
if (!epsMatch) epsMatch = fullText.match(/EPS\s+(?:Value)?\s*([0-9.-]+)/i);
if (epsMatch) data.eps_ttm = epsMatch[1];
// Revenue TTM
let revenueMatch = fullText.match(/Revenue\s*\(TTM\)\s*(?:Value)?\s*\$([0-9.]+[KMBT]?)/i);
if (!revenueMatch) revenueMatch = fullText.match(/Revenue\s+(?:Value)?\s*\$([0-9.]+[KMBT])/i);
if (revenueMatch) data.revenue_ttm = revenueMatch[1];
// P/E TTM (multiple patterns)
let peMatch = fullText.match(/Price[\/\s]*Earnings\s*\(TTM\)\s*(?:Value)?\s*([0-9.]+)/i);
if (!peMatch) peMatch = fullText.match(/P[\/\s]*E\s*\(?TTM\)?\s*(?:Value)?\s*([0-9.]+)/i);
if (!peMatch) peMatch = fullText.match(/PE Ratio\s*\(TTM\)\s*(?:Value)?\s*([0-9.]+)/i);
if (peMatch) data.pe_ttm = peMatch[1];
// Forward P/E
let forwardPeMatch = fullText.match(/Forward\s+P[\/\s]*E\s*(?:Value)?\s*([0-9.]+)/i);
if (!forwardPeMatch) forwardPeMatch = fullText.match(/P[\/\s]*E\s*\(Forward\)\s*(?:Value)?\s*([0-9.]+)/i);
if (forwardPeMatch) data.forward_pe = forwardPeMatch[1];
// PEG Ratio
let pegMatch = fullText.match(/Price\s+to\s+Earnings[\/\s]*Growth\s*\(PEG\)\s*(?:Value)?\s*([0-9.]+)/i);
if (!pegMatch) pegMatch = fullText.match(/PEG\s*Ratio?\s*(?:Value)?\s*([0-9.]+)/i);
if (pegMatch) data.peg_ratio = pegMatch[1];
// Recent beats/misses (simplified - just extract beat amounts)
const beatMatches = fullText.matchAll(/Beat.*?\$([0-9.]+)/gi);
data.recent_beats = [];
for (const match of beatMatches) {
data.recent_beats.push(match[1]);
}
return data;
}
''', debug)

        # Parse and assign values
        earnings.next_announcement_date = earnings_data.get('next_announcement_date')
        earnings.announcement_timing = earnings_data.get('announcement_timing')
        earnings.analysts_covering = _parse_int(earnings_data.get('analysts_covering'))
        earnings.consensus_estimate = _parse_float(earnings_data.get('consensus_estimate'))
        earnings.estimate_high = _parse_float(earnings_data.get('estimate_high'))
        earnings.estimate_low = _parse_float(earnings_data.get('estimate_low'))
        earnings.eps_ttm = _parse_float(earnings_data.get('eps_ttm'))
        earnings.revenue_ttm = _parse_revenue(earnings_data.get('revenue_ttm', ''))
        earnings.pe_ttm = _parse_float(earnings_data.get('pe_ttm'))
        earnings.forward_pe = _parse_float(earnings_data.get('forward_pe'))
        earnings.peg_ratio = _parse_float(earnings_data.get('peg_ratio'))

        # Store recent beats as list of dicts; left untouched when no
        # "Beat ... $x" text was found.
        if earnings_data.get('recent_beats'):
            earnings.recent_beats = [
                {'beat_amount': _parse_float(beat)}
                for beat in earnings_data.get('recent_beats', [])
            ]

        if debug:
            logger.debug(f"Extracted earnings data: eps_ttm={earnings.eps_ttm}, "
                         f"pe_ttm={earnings.pe_ttm}, forward_pe={earnings.forward_pe}")
    except Exception as e:
        # Deliberate best-effort: a scrape failure must not abort the caller.
        if debug:
            logger.debug(f"Error extracting earnings data: {e}")
    return earnings
def calculate_payout_ratio(annual_dividend: Optional[float], eps_ttm: Optional[float]) -> Optional[float]:
    """Compute the dividend payout ratio as a percentage.

    Formula: (Annual Dividend Rate / EPS TTM) * 100, rounded to two decimals.

    Args:
        annual_dividend: Annual dividend rate per share
        eps_ttm: Earnings per share (trailing twelve months)

    Returns:
        The payout ratio in percent, or ``None`` when either input is missing
        or zero, or when ``eps_ttm`` is not positive.
    """
    # Missing or zero inputs: nothing meaningful to report.
    if not annual_dividend or not eps_ttm:
        return None
    # A ratio against non-positive earnings is not reported.
    if not eps_ttm > 0:
        return None
    percentage = annual_dividend / eps_ttm * 100
    return round(percentage, 2)
async def extract_phase1_data(page, debug: bool = False) -> EquityPhase1Data:
    """Run every Phase 1 extractor against ``page`` and bundle the results.

    The ticker is read from the page URL, then the quote, dividend and
    earnings extractors run in that order. The payout ratio is derived only
    when both the annual dividend rate and the trailing EPS are truthy.

    Args:
        page: Playwright page object
        debug: Enable debug output

    Returns:
        EquityPhase1Data object with all extracted data
    """
    if debug:
        logger.debug("Starting Phase 1 data extraction...")

    # Fixed pause before touching the DOM.
    await page.wait_for_timeout(3000)

    # The ticker is the run of letters following "stocks/" in the URL;
    # an empty string is returned when the URL has no such segment.
    ticker = await page.evaluate('''
() => {
const url = window.location.href;
const match = url.match(/stocks\\/([A-Z]+)/i);
return match ? match[1].toUpperCase() : '';
}
''')

    quote_section = await extract_quote_data(page, ticker=ticker, debug=debug)
    dividend_section = await extract_enhanced_dividends(page, debug=debug)
    earnings_section = await extract_earnings_data(page, debug=debug)

    derived = CalculatedMetrics()
    annual_rate = dividend_section.annual_rate
    trailing_eps = earnings_section.eps_ttm
    if annual_rate and trailing_eps:
        derived.payout_ratio = calculate_payout_ratio(annual_rate, trailing_eps)

    result = EquityPhase1Data(
        ticker=ticker,
        quote=quote_section,
        dividends=dividend_section,
        earnings=earnings_section,
        calculated_metrics=derived,
    )
    if debug:
        logger.debug(f"Phase 1 extraction complete for {ticker}")
    return result