Fix build: Bundle schwab_scraper source and use local dependencies
All checks were successful
Build and Push Docker Image / build (push) Successful in 34s

This commit is contained in:
2026-04-24 01:50:20 +00:00
parent 02ac293692
commit 650ea2d087
43 changed files with 10900 additions and 41 deletions

View File

@@ -0,0 +1,786 @@
"""Phase 1: Essential Dividend Metrics Implementation (DEPRECATED)
⚠️ DEPRECATED: This DOM-scraping based approach has been replaced by phase1_api_scraper.py
which uses Schwab's REST APIs directly. The API approach is more reliable, complete,
and maintainable than DOM scraping.
This module is kept for reference only. New code should use phase1_api_scraper.py.
Old approach extracts from DOM:
- Quote/Price Data (symbol bar)
- Enhanced Dividend Information (forward-looking dates)
- Core Earnings Metrics (EPS, forecasts)
- Basic Valuation Ratios (P/E, Forward P/E, PEG)
- Calculated Metrics (payout ratio)
"""
import logging
import math
import re
from typing import Dict, Any, Optional

from ...core import QuoteData, EnhancedDividends, EarningsData, CalculatedMetrics, EquityPhase1Data
logger = logging.getLogger(__name__)
def _parse_float(value: Any) -> Optional[float]:
"""Safely parse a value to float, handling $ and % symbols."""
if value is None:
return None
try:
# Remove common formatting characters
clean = str(value).strip().replace('$', '').replace(',', '').replace('%', '')
if clean and clean != '--' and clean.lower() != 'n/a':
return float(clean)
except (ValueError, AttributeError):
pass
return None
def _parse_int(value: Any) -> Optional[int]:
"""Safely parse a value to int."""
if value is None:
return None
try:
clean = str(value).strip().replace(',', '')
if clean and clean != '--' and clean.lower() != 'n/a':
return int(float(clean))
except (ValueError, AttributeError):
pass
return None
def _parse_volume(volume_str: str) -> Optional[int]:
"""Parse volume string like '8M', '22.4M', '1.2B' to integer."""
if not volume_str:
return None
try:
volume_str = volume_str.strip().upper()
multiplier = 1
if volume_str.endswith('K'):
multiplier = 1_000
volume_str = volume_str[:-1]
elif volume_str.endswith('M'):
multiplier = 1_000_000
volume_str = volume_str[:-1]
elif volume_str.endswith('B'):
multiplier = 1_000_000_000
volume_str = volume_str[:-1]
value = float(volume_str)
return int(value * multiplier)
except (ValueError, AttributeError):
return None
def _parse_revenue(revenue_str: str) -> Optional[float]:
"""Parse revenue string like '$92.15B', '$1.5M' to dollar value."""
if not revenue_str:
return None
try:
revenue_str = revenue_str.strip().upper().replace('$', '').replace(',', '')
multiplier = 1
if revenue_str.endswith('K'):
multiplier = 1_000
revenue_str = revenue_str[:-1]
elif revenue_str.endswith('M'):
multiplier = 1_000_000
revenue_str = revenue_str[:-1]
elif revenue_str.endswith('B'):
multiplier = 1_000_000_000
revenue_str = revenue_str[:-1]
elif revenue_str.endswith('T'):
multiplier = 1_000_000_000_000
revenue_str = revenue_str[:-1]
value = float(revenue_str)
return value * multiplier
except (ValueError, AttributeError):
return None
async def extract_quote_data(page, ticker: str = "", debug: bool = False) -> QuoteData:
    """Extract quote/price data from the symbol bar.

    The symbol bar's text content is read in the browser and mined with
    regular expressions; the captured strings are then converted with the
    module's ``_parse_*`` helpers. Extraction is best-effort: any exception
    is swallowed (and logged only when ``debug`` is set) and whatever was
    populated up to that point is returned.

    Args:
        page: Playwright page object
        ticker: Stock ticker symbol (for pattern matching)
        debug: Enable debug logging

    Returns:
        QuoteData object with extracted fields
    """
    quote = QuoteData()
    try:
        if debug:
            logger.debug("Starting quote data extraction...")
        # Wait for symbol bar content (look for key labels).
        # NOTE: a comma-separated list is parsed by the CSS engine, so the text
        # alternative has to be the :text() pseudo-class; a "text=" engine
        # prefix is only recognised at the start of a selector.
        try:
            await page.wait_for_selector(
                '#app-symbol-bar-component, :text("Previous close")',
                state='attached',
                timeout=15000,
            )
        except Exception:
            if debug:
                logger.debug("Timeout waiting for symbol bar selector, attempting to parse whatever is there")
        # The raw symbol bar text is only used for diagnostics, so skip the
        # extra browser round trip unless debugging.
        if debug:
            # Extract symbol bar text content (fallback to body if specific component not found)
            symbol_bar_text = await page.evaluate('''
                () => {
                    const symbolBar = document.querySelector('#app-symbol-bar-component');
                    if (symbolBar && symbolBar.textContent && symbolBar.textContent.includes('Previous close')) return symbolBar.textContent;
                    // If specific component not found, try to find the container with market data
                    // Look for container with "Previous close"
                    const labels = Array.from(document.querySelectorAll('span, div, p'));
                    const prevCloseLabel = labels.find(el => el.textContent && el.textContent.includes('Previous close'));
                    if (prevCloseLabel) {
                        // Return the parent's text content (go up a few levels to capture all data)
                        let parent = prevCloseLabel.parentElement;
                        let count = 0;
                        while (parent && count < 8) {
                            if (parent.textContent.length > 300) return parent.textContent;
                            parent = parent.parentElement;
                            count++;
                        }
                    }
                    return document.body.textContent || '';
                }
            ''')
            logger.debug(f"Symbol bar text (first 500 chars): {symbol_bar_text[:500]}")
        # Extract structured data.
        # This is a RAW Python string: the JavaScript below is sent to the
        # browser verbatim, so regex literals use single backslashes and
        # RegExp() string arguments use exactly one level of JS escaping.
        quote_data = await page.evaluate(r'''
            (ticker) => {
                const data = {};
                // Helper to get text content from page
                const getText = () => {
                    const symbolBar = document.querySelector('#app-symbol-bar-component');
                    // Verify it looks like the right component by checking for "Previous close"
                    if (symbolBar && symbolBar.textContent && symbolBar.textContent.includes('Previous close')) {
                        return symbolBar.textContent;
                    }
                    // Fallback logic
                    const labels = Array.from(document.querySelectorAll('span, div, p'));
                    const prevCloseLabel = labels.find(el => el.textContent && el.textContent.includes('Previous close'));
                    if (prevCloseLabel) {
                        let parent = prevCloseLabel.parentElement;
                        let count = 0;
                        while (parent && count < 8) {
                            if (parent.textContent.length > 300) return parent.textContent;
                            parent = parent.parentElement;
                            count++;
                        }
                    }
                    // Last resort: body text
                    return document.body.textContent || '';
                };
                const fullText = getText();
                // Try to find price in quote container first for accuracy
                const priceElement = document.querySelector('.symbol-quote-container, [data-testid="quote-price"]');
                if (priceElement) {
                    const priceText = priceElement.textContent || '';
                    const priceMatch = priceText.match(/\$([0-9,]+\.[0-9]+)/);
                    if (priceMatch) data.price = priceMatch[1].replace(',', '');
                } else {
                    // Fallback regex for price if element not found
                    // Look for price near top or just regex
                    const priceMatch = fullText.match(/\$([0-9,]+\.[0-9]{2})(\s|[+-]|$)/);
                    if (priceMatch) data.price = priceMatch[1].replace(',', '');
                }
                // After hours (using \s* for robustness)
                const afterHoursMatch = fullText.match(/After hours:?\s*\$([0-9,.]+)/i);
                if (afterHoursMatch) data.after_hours_price = afterHoursMatch[1].replace(',', '');
                const afterHoursChangeMatch = fullText.match(/After hours:.*?([+-]\$[0-9,.]+)\s*\(([+-][0-9.]+)%\)/i);
                if (afterHoursChangeMatch) {
                    data.after_hours_change = afterHoursChangeMatch[1].replace('$', '').replace(',', '');
                    data.after_hours_change_percent = afterHoursChangeMatch[2];
                }
                // Bid/Ask (using \s* for robustness)
                const bidMatch = fullText.match(/Bid\s*\$([0-9,.]+)/i);
                if (bidMatch) data.bid = bidMatch[1].replace(',', '');
                const askMatch = fullText.match(/Ask\s*\$([0-9,.]+)/i);
                if (askMatch) data.ask = askMatch[1].replace(',', '');
                const bidAskSizeMatch = fullText.match(/Bid\/Ask Size\s*([0-9]+\/[0-9]+)/i);
                if (bidAskSizeMatch) data.bid_ask_size = bidAskSizeMatch[1];
                // Previous close and open (using \s* instead of \s+)
                const prevCloseMatch = fullText.match(/Previous close\s*\$([0-9,.]+)/i);
                if (prevCloseMatch) data.previous_close = prevCloseMatch[1].replace(',', '');
                const openMatch = fullText.match(/Today's open\s*\$([0-9,.]+)/i);
                if (openMatch) data.open = openMatch[1].replace(',', '');
                // Volume (using \s*)
                const volumeMatch = fullText.match(/Today's volume\s*([0-9.]+[KMB]?)/i);
                if (volumeMatch) data.volume = volumeMatch[1];
                const volumeVsAvgMatch = fullText.match(/Today's volume\s*[0-9.]+[KMB]?\s*(Above Avg\.|Below Avg\.|Average)/i);
                if (volumeVsAvgMatch) data.volume_vs_avg = volumeVsAvgMatch[1];
                // Day range
                // Pattern: "Today's range low $200.81 Today's range high $203.45" or similar
                // We'll look for "low $X" and "high $Y" appearing after "Today's range"
                const dayRangeMatch = fullText.match(/Today's range.*?low\s*\$([0-9,.]+).*?high\s*\$([0-9,.]+)/i);
                if (dayRangeMatch) {
                    data.day_range_low = dayRangeMatch[1].replace(',', '');
                    data.day_range_high = dayRangeMatch[2].replace(',', '');
                }
                // 52-week range
                const weekRangeMatch = fullText.match(/52-week range.*?low\s*\$([0-9,.]+).*?high\s*\$([0-9,.]+)/i);
                if (weekRangeMatch) {
                    data.week_52_low = weekRangeMatch[1].replace(',', '');
                    data.week_52_high = weekRangeMatch[2].replace(',', '');
                }
                // Market cap (may be in Share Profile section)
                const marketCapMatch = fullText.match(/Market Cap\s*\$([0-9.]+[KMBT])/i);
                if (marketCapMatch) data.market_cap = marketCapMatch[1];
                // Change and change percent
                // Try specific formatted pattern first: TICKER $PRICE CHANGE CHANGE%
                // e.g. "JNJ $201.95 -1.03 -0.51%"
                const standardPattern = fullText.match(/\$([0-9,.]+)\s*([+-]?[0-9,.]+)\s*([+-]?[0-9.]+)%/);
                if (standardPattern) {
                    if (!data.price) data.price = standardPattern[1].replace(',', '');
                    data.change = standardPattern[2];
                    data.change_percent = standardPattern[3];
                }
                let percentMatch = null;
                if (ticker && !data.change_percent) {
                    // Match: TICKER$digits.digits{2}percent%
                    // Escape the ticker so symbols containing regex
                    // metacharacters (e.g. "BRK.B") are matched literally.
                    const escapedTicker = ticker.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
                    // One level of JS string escaping only: "\\." in the
                    // string literal is the regex "\." (a literal dot).
                    const tickerPattern = new RegExp(escapedTicker + '\\.?[\\s]*\\$([0-9,]+\\.[0-9]{2})[\\s]*([0-9.]+)%', 'i');
                    percentMatch = fullText.match(tickerPattern);
                    if (percentMatch) {
                        data.change_percent = percentMatch[2];
                    }
                }
                if (!data.change_percent) {
                    // Fallback: match any price+percent pattern with space
                    const fallbackMatch = fullText.match(/\$[0-9,.]+\s*([+-]?[0-9.]+)%/);
                    if (fallbackMatch) {
                        data.change_percent = fallbackMatch[1];
                    }
                }
                // Pattern 2: "+$1.23 (+0.45%)" or "-$1.23 (-0.45%)"
                let changeMatch = fullText.match(/([+-]\$[0-9,.]+)\s*\(([+-][0-9.]+)%\)/);
                // Pattern 3: "$193.08 +1.23 +0.64%" (price followed by change)
                if (!changeMatch) {
                    changeMatch = fullText.match(/\$[0-9,.]+\s*([+-][0-9,.]+)\s*([+-][0-9.]+)%/);
                }
                // Pattern 4: "Change: +1.23 (+0.64%)"
                if (!changeMatch) {
                    changeMatch = fullText.match(/Change:?\s*([+-][0-9,.]+)\s*\(([+-][0-9.]+)%\)/i);
                }
                if (changeMatch) {
                    data.change = changeMatch[1].replace('$', '').replace(',', '');
                    if (!data.change_percent) {
                        data.change_percent = changeMatch[2].replace(/[+]/g, '');
                    }
                }
                // Exchange - look for NYSE, NASDAQ, etc.
                const exchangeMatch = fullText.match(/\b(NYSE|NASDAQ|AMEX|OTC|BATS)\b/i);
                if (exchangeMatch) data.exchange = exchangeMatch[1].toUpperCase();
                return data;
            }
        ''', ticker)
        # Parse and assign values
        float_fields = (
            'price',
            'change',
            'change_percent',
            'after_hours_price',
            'after_hours_change',
            'after_hours_change_percent',
            'bid',
            'ask',
            'previous_close',
            'open',
            'day_range_low',
            'day_range_high',
            'week_52_low',
            'week_52_high',
        )
        for field in float_fields:
            setattr(quote, field, _parse_float(quote_data.get(field)))
        quote.volume = _parse_volume(quote_data.get('volume', ''))
        # These are kept as the raw page text (e.g. "100/200", "Above Avg.", "1.2T").
        quote.bid_ask_size = quote_data.get('bid_ask_size')
        quote.volume_vs_avg = quote_data.get('volume_vs_avg')
        quote.market_cap = quote_data.get('market_cap')
        # Try to extract sector and exchange from page header
        header_data = await page.evaluate(r'''
            () => {
                const data = {};
                // Look for sector near company name
                const sectorElement = document.querySelector('[data-testid="sector"], .sector');
                if (sectorElement) {
                    data.sector = sectorElement.textContent.replace('Sector', '').trim();
                } else {
                    // Manual search for text containing "Sector"
                    const spans = Array.from(document.querySelectorAll('span'));
                    const sectorSpan = spans.find(el => el.textContent && el.textContent.includes('Sector'));
                    if (sectorSpan) {
                        data.sector = sectorSpan.textContent.replace('Sector', '').replace(':', '').trim();
                    }
                }
                // Look for exchange near ticker
                const exchangeElement = document.querySelector('[data-testid="exchange"], .exchange');
                if (exchangeElement) {
                    data.exchange = exchangeElement.textContent.trim();
                }
                // Fallback: parse from page text
                const pageText = document.body.textContent || '';
                if (!data.sector) {
                    const sectorMatch = pageText.match(/Sector[:\s]+([A-Za-z\s&]+)/);
                    if (sectorMatch) data.sector = sectorMatch[1].trim();
                }
                if (!data.exchange) {
                    const exchangeMatch = pageText.match(/(NYSE|NASDAQ|AMEX|OTC)/i);
                    if (exchangeMatch) data.exchange = exchangeMatch[1].toUpperCase();
                }
                return data;
            }
        ''')
        quote.sector = header_data.get('sector')
        # Prefer the header value; fall back to the exchange found in the
        # symbol bar text, which was previously parsed and then discarded.
        quote.exchange = header_data.get('exchange') or quote_data.get('exchange')
        if debug:
            logger.debug(f"Extracted quote data: price={quote.price}, volume={quote.volume}, "
                         f"52w_range={quote.week_52_low}-{quote.week_52_high}")
    except Exception as e:
        # Best-effort by design: return the partially populated quote.
        if debug:
            logger.debug(f"Error extracting quote data: {e}")
    return quote
async def extract_enhanced_dividends(page, debug: bool = False) -> EnhancedDividends:
    """Read the dividends panel and return its forward and backward looking fields.

    The panel is scrolled into view and its header clicked before the panel
    text is mined with regular expressions in the browser. Any failure is
    swallowed (logged only when ``debug`` is set) and the fields populated
    so far are returned.

    Args:
        page: Playwright page object
        debug: Enable debug logging

    Returns:
        EnhancedDividends object with extracted fields
    """
    result = EnhancedDividends()
    try:
        if debug:
            logger.debug("Starting enhanced dividend extraction...")
        # Block until the panel exists, then bring it into the viewport.
        await page.wait_for_selector('#dividends', timeout=15000)
        await page.evaluate('''
            () => {
                const dividendsPanel = document.querySelector('#dividends');
                if (dividendsPanel) {
                    dividendsPanel.scrollIntoView({ behavior: 'smooth', block: 'center' });
                }
            }
        ''')
        await page.wait_for_timeout(1000)
        # The original author notes the panel content only loads after its
        # header has been clicked, so click it and give it time to render.
        if debug:
            logger.debug("Clicking dividends panel header to trigger content load...")
        try:
            header = await page.query_selector('#dividends h2, #dividends .sdps-panel__title, #dividends-togglechevron-button')
            if header:
                await header.click()
                await page.wait_for_timeout(2000)
                if debug:
                    logger.debug("Clicked dividends panel header successfully")
        except Exception as click_error:
            if debug:
                logger.debug(f"Could not click dividends header: {click_error}")
        # Extra settle time after the click attempt.
        await page.wait_for_timeout(1000)
        # Non-raw string: each "\\" below reaches the browser as a single "\".
        scraped = await page.evaluate('''
            () => {
                const data = {};
                const dividendsPanel = document.querySelector('#dividends');
                if (!dividendsPanel) return data;
                const fullText = dividendsPanel.textContent || '';
                // DEBUG: Return sample of text for debugging
                data._debug_text_sample = fullText.substring(0, 800);
                // Next dividend payment
                const nextPaymentMatch = fullText.match(/Next Dividend Payment\\s*\\$([0-9.]+)/i);
                if (nextPaymentMatch) data.next_payment = nextPaymentMatch[1];
                // Next pay date
                const nextPayDateMatch = fullText.match(/Next Pay Date\\s*([A-Za-z]+ [0-9]{1,2}, [0-9]{4})/i);
                if (nextPayDateMatch) data.next_pay_date = nextPayDateMatch[1];
                // Next ex-date
                const nextExDateMatch = fullText.match(/Next Ex-Date\\s*([A-Za-z]+ [0-9]{1,2}, [0-9]{4})/i);
                if (nextExDateMatch) data.next_ex_date = nextExDateMatch[1];
                // Previous dividend payment
                const prevPaymentMatch = fullText.match(/Previous Dividend Payment\\s*\\$([0-9.]+)/i);
                if (prevPaymentMatch) data.previous_payment = prevPaymentMatch[1];
                // Previous pay date
                const prevPayDateMatch = fullText.match(/Previous Pay Date\\s*([A-Za-z]+ [0-9]{1,2}, [0-9]{4})/i);
                if (prevPayDateMatch) data.previous_pay_date = prevPayDateMatch[1];
                // Previous ex-date
                const prevExDateMatch = fullText.match(/Previous Ex-Date\\s*([A-Za-z]+ [0-9]{1,2}, [0-9]{4})/i);
                if (prevExDateMatch) data.previous_ex_date = prevExDateMatch[1];
                // Frequency
                const frequencyMatch = fullText.match(/Frequency\\s*(Quarterly|Monthly|Annual|Semi-Annual)/i);
                if (frequencyMatch) data.frequency = frequencyMatch[1];
                // Annual Dividend Rate (IAD)
                const annualRateMatch = fullText.match(/Annual Dividend Rate.*?\\$([0-9.]+)/i);
                if (annualRateMatch) data.annual_rate = annualRateMatch[1];
                // Annual Dividend Yield - appears after "Annual Dividend Yield" text
                // Text pattern: "Annual Dividend Yield...2.71%"
                const yieldMatch = fullText.match(/Annual Dividend Yield[\\s\\S]{0,300}?([0-9]+\\.[0-9]+)%/i);
                if (yieldMatch) data.annual_yield = yieldMatch[1];
                return data;
            }
        ''')
        text_sample = scraped.get('_debug_text_sample')
        if debug and text_sample:
            logger.debug(f"Dividend panel text sample: {text_sample}")
        # Monetary / percentage fields are converted to floats ...
        for numeric_field in ('next_payment', 'previous_payment', 'annual_rate', 'annual_yield'):
            setattr(result, numeric_field, _parse_float(scraped.get(numeric_field)))
        # ... while dates and the frequency label stay as the page's own text.
        for text_field in (
            'next_pay_date',
            'next_ex_date',
            'previous_pay_date',
            'previous_ex_date',
            'frequency',
        ):
            setattr(result, text_field, scraped.get(text_field))
        if debug:
            logger.debug(f"Extracted dividend data: next_payment={result.next_payment}, "
                         f"next_pay_date={result.next_pay_date}, annual_rate={result.annual_rate}")
    except Exception as error:
        if debug:
            logger.debug(f"Error extracting dividend data: {error}")
    return result
async def extract_earnings_data(page, debug: bool = False) -> EarningsData:
    """Extract earnings metrics and forecasts.

    The earnings panel is scrolled into view, its header and any
    "Show More" control are clicked, and the panel text (including shadow
    DOM content) is mined with regular expressions in the browser. Any
    failure is swallowed (logged only when ``debug`` is set) and the fields
    populated so far are returned.

    Args:
        page: Playwright page object
        debug: Enable debug logging

    Returns:
        EarningsData object with extracted fields
    """
    earnings = EarningsData()
    try:
        if debug:
            logger.debug("Starting earnings data extraction...")
        # Wait for earnings panel to load
        await page.wait_for_selector('#expected-earnings', timeout=15000)
        # Scroll to earnings panel
        await page.evaluate('''
            () => {
                const earningsPanel = document.querySelector('#expected-earnings');
                if (earningsPanel) {
                    earningsPanel.scrollIntoView({ behavior: 'smooth', block: 'center' });
                }
            }
        ''')
        await page.wait_for_timeout(1000)
        # CRITICAL: Click on the panel header to trigger content loading
        # Schwab's panels don't auto-load - they need to be clicked
        if debug:
            logger.debug("Clicking earnings panel header to trigger content load...")
        try:
            earnings_header = await page.query_selector('#expected-earnings h2, #expected-earnings .sdps-panel__title, #expected-earnings-heading, #expected-earnings-togglechevron-button')
            if earnings_header:
                await earnings_header.click()
                await page.wait_for_timeout(2000)
                if debug:
                    logger.debug("Clicked earnings panel header successfully")
        except Exception as e:
            if debug:
                logger.debug(f"Could not click earnings header: {e}")
        # Wait for content to load after click
        await page.wait_for_timeout(1000)
        # Check for and click "Show More" if present
        try:
            # Use JS to find and click - most robust way
            clicked = await page.evaluate('''
                () => {
                    const panel = document.querySelector('#expected-earnings');
                    if (!panel) return false;
                    // Find any element with "Show More" text
                    const elements = Array.from(panel.querySelectorAll('a, button, span, div'));
                    const showMore = elements.find(el => el.textContent.trim().toLowerCase() === "show more");
                    if (showMore) {
                        showMore.click();
                        return true;
                    }
                    return false;
                }
            ''')
            if clicked:
                if debug:
                    logger.debug("found and clicked 'Show More' via JS")
                await page.wait_for_timeout(2000)
            elif debug:
                logger.debug("'Show More' not found or not clickable")
        except Exception as e:
            if debug:
                logger.debug(f"Error checking for Show More: {e}")
        # Extract earnings data.
        # This is a RAW Python string: the JavaScript is sent to the browser
        # verbatim, so every regex literal must use single backslashes
        # ("\$" for a literal dollar sign, not "\\$").
        earnings_data = await page.evaluate(r'''
            (debug) => {
                const data = {};
                // Helper to get text content including Shadow DOMs
                const getDeepText = (root) => {
                    if (!root) return '';
                    if (root.nodeType === Node.TEXT_NODE) return root.textContent;
                    if (root.nodeType === Node.ELEMENT_NODE && root.shadowRoot) {
                        return getDeepText(root.shadowRoot);
                    }
                    let text = '';
                    const children = root.childNodes;
                    for (let i = 0; i < children.length; i++) {
                        text += getDeepText(children[i]);
                    }
                    return text;
                };
                const earningsPanel = document.querySelector('#expected-earnings');
                let fullText = '';
                if (earningsPanel) {
                    fullText = getDeepText(earningsPanel);
                }
                // Fallback to body deep text if panel seems empty
                if (fullText.length < 500 || !fullText.includes("Announcement")) {
                    fullText = getDeepText(document.body);
                }
                // Next earnings announcement - robust regex checking for various patterns
                let nextAnnouncementMatch = fullText.match(/Next Earnings Announcement.*?([0-9]{2}\/[0-9]{2}\/[0-9]{4})/i);
                if (!nextAnnouncementMatch) {
                    // Try alternate pattern: Announcement: 12/12/2025
                    nextAnnouncementMatch = fullText.match(/Announcement:?\s*([0-9]{2}\/[0-9]{2}\/[0-9]{4})/i);
                }
                if (nextAnnouncementMatch) data.next_announcement_date = nextAnnouncementMatch[1];
                // Announcement timing
                const timingMatch = fullText.match(/(Before Market Open|After Market Close)/i);
                if (timingMatch) data.announcement_timing = timingMatch[1];
                // Number of analysts
                const analystsMatch = fullText.match(/With ([0-9]+) analysts covering/i);
                if (analystsMatch) data.analysts_covering = analystsMatch[1];
                // Consensus estimate
                const consensusMatch = fullText.match(/consensus.*?estimate is \$([0-9.]+)/i);
                if (consensusMatch) data.consensus_estimate = consensusMatch[1];
                // High/Low estimates
                const highLowMatch = fullText.match(/high and low estimates are \$([0-9.]+) and \$([0-9.]+)/i);
                if (highLowMatch) {
                    data.estimate_high = highLowMatch[1];
                    data.estimate_low = highLowMatch[2];
                }
                // EPS TTM (multiple patterns)
                let epsMatch = fullText.match(/EPS\s*\(TTM\)\s*(?:Value)?\s*\$?([0-9.-]+)/i);
                if (!epsMatch) epsMatch = fullText.match(/Earnings per Share\s*\(?TTM\)?\s*(?:Value)?\s*\$?([0-9.-]+)/i);
                if (!epsMatch) epsMatch = fullText.match(/EPS\s+(?:Value)?\s*([0-9.-]+)/i);
                if (epsMatch) data.eps_ttm = epsMatch[1];
                // Revenue TTM
                let revenueMatch = fullText.match(/Revenue\s*\(TTM\)\s*(?:Value)?\s*\$([0-9.]+[KMBT]?)/i);
                if (!revenueMatch) revenueMatch = fullText.match(/Revenue\s+(?:Value)?\s*\$([0-9.]+[KMBT])/i);
                if (revenueMatch) data.revenue_ttm = revenueMatch[1];
                // P/E TTM (multiple patterns)
                let peMatch = fullText.match(/Price[\/\s]*Earnings\s*\(TTM\)\s*(?:Value)?\s*([0-9.]+)/i);
                if (!peMatch) peMatch = fullText.match(/P[\/\s]*E\s*\(?TTM\)?\s*(?:Value)?\s*([0-9.]+)/i);
                if (!peMatch) peMatch = fullText.match(/PE Ratio\s*\(TTM\)\s*(?:Value)?\s*([0-9.]+)/i);
                if (peMatch) data.pe_ttm = peMatch[1];
                // Forward P/E
                let forwardPeMatch = fullText.match(/Forward\s+P[\/\s]*E\s*(?:Value)?\s*([0-9.]+)/i);
                if (!forwardPeMatch) forwardPeMatch = fullText.match(/P[\/\s]*E\s*\(Forward\)\s*(?:Value)?\s*([0-9.]+)/i);
                if (forwardPeMatch) data.forward_pe = forwardPeMatch[1];
                // PEG Ratio
                let pegMatch = fullText.match(/Price\s+to\s+Earnings[\/\s]*Growth\s*\(PEG\)\s*(?:Value)?\s*([0-9.]+)/i);
                if (!pegMatch) pegMatch = fullText.match(/PEG\s*Ratio?\s*(?:Value)?\s*([0-9.]+)/i);
                if (pegMatch) data.peg_ratio = pegMatch[1];
                // Recent beats/misses (simplified - just extract beat amounts)
                const beatMatches = fullText.matchAll(/Beat.*?\$([0-9.]+)/gi);
                data.recent_beats = [];
                for (const match of beatMatches) {
                    data.recent_beats.push(match[1]);
                }
                return data;
            }
        ''', debug)
        # Parse and assign values
        earnings.next_announcement_date = earnings_data.get('next_announcement_date')
        earnings.announcement_timing = earnings_data.get('announcement_timing')
        earnings.analysts_covering = _parse_int(earnings_data.get('analysts_covering'))
        earnings.consensus_estimate = _parse_float(earnings_data.get('consensus_estimate'))
        earnings.estimate_high = _parse_float(earnings_data.get('estimate_high'))
        earnings.estimate_low = _parse_float(earnings_data.get('estimate_low'))
        earnings.eps_ttm = _parse_float(earnings_data.get('eps_ttm'))
        earnings.revenue_ttm = _parse_revenue(earnings_data.get('revenue_ttm', ''))
        earnings.pe_ttm = _parse_float(earnings_data.get('pe_ttm'))
        earnings.forward_pe = _parse_float(earnings_data.get('forward_pe'))
        earnings.peg_ratio = _parse_float(earnings_data.get('peg_ratio'))
        # Store recent beats as list of dicts; the attribute is left
        # untouched when no beats were found.
        if earnings_data.get('recent_beats'):
            earnings.recent_beats = [
                {'beat_amount': _parse_float(beat)}
                for beat in earnings_data.get('recent_beats', [])
            ]
        if debug:
            logger.debug(f"Extracted earnings data: eps_ttm={earnings.eps_ttm}, "
                         f"pe_ttm={earnings.pe_ttm}, forward_pe={earnings.forward_pe}")
    except Exception as e:
        # Best-effort by design: return the partially populated object.
        if debug:
            logger.debug(f"Error extracting earnings data: {e}")
    return earnings
def calculate_payout_ratio(annual_dividend: Optional[float], eps_ttm: Optional[float]) -> Optional[float]:
    """Return the dividend payout ratio as a percentage.

    Formula: (Annual Dividend Rate / EPS TTM) × 100, rounded to two
    decimal places.

    Args:
        annual_dividend: Annual dividend rate per share
        eps_ttm: Earnings per share (trailing twelve months)

    Returns:
        Payout ratio as percentage, or None when either input is missing
        or zero, or when EPS is not positive.
    """
    # Missing or zero inputs: nothing to compute.
    if not annual_dividend or not eps_ttm:
        return None
    # The ratio is only reported against positive earnings.
    if not eps_ttm > 0:
        return None
    percentage = annual_dividend / eps_ttm * 100
    return round(percentage, 2)
async def extract_phase1_data(page, debug: bool = False) -> EquityPhase1Data:
    """Run every Phase 1 extractor against the page and bundle the results.

    The ticker is taken from the page URL, the quote, dividend and earnings
    sections are scraped in that order, and the payout ratio is derived
    from the scraped dividend rate and EPS when both are available.

    Args:
        page: Playwright page object
        debug: Enable debug output

    Returns:
        EquityPhase1Data object with all extracted data
    """
    if debug:
        logger.debug("Starting Phase 1 data extraction...")
    # Fixed pause to let the page settle before scraping.
    await page.wait_for_timeout(3000)
    # The symbol is whatever letters follow "stocks/" in the current URL
    # (empty string when the URL has no such segment).
    symbol = await page.evaluate('''
        () => {
            const url = window.location.href;
            const match = url.match(/stocks\\/([A-Z]+)/i);
            return match ? match[1].toUpperCase() : '';
        }
    ''')
    # Scrape each section in turn.
    quote_section = await extract_quote_data(page, ticker=symbol, debug=debug)
    dividend_section = await extract_enhanced_dividends(page, debug=debug)
    earnings_section = await extract_earnings_data(page, debug=debug)
    # Derived metrics: only attempted when both inputs were scraped.
    derived = CalculatedMetrics()
    if dividend_section.annual_rate and earnings_section.eps_ttm:
        derived.payout_ratio = calculate_payout_ratio(
            dividend_section.annual_rate,
            earnings_section.eps_ttm,
        )
    bundle = EquityPhase1Data(
        ticker=symbol,
        quote=quote_section,
        dividends=dividend_section,
        earnings=earnings_section,
        calculated_metrics=derived,
    )
    if debug:
        logger.debug(f"Phase 1 extraction complete for {symbol}")
    return bundle