Files
schwab-mcp-custom/schwab_scraper/features/equity/scraper.py
b3nw 650ea2d087
All checks were successful
Build and Push Docker Image / build (push) Successful in 34s
Fix build: Bundle schwab_scraper source and use local dependencies
2026-04-24 01:50:20 +00:00

978 lines
52 KiB
Python

from typing import Dict, Any, Optional
from ...utils.logging import save_debug_artifact
def should_replace_dividend_value(existing_value: Optional[str], new_value: Optional[str]) -> bool:
    """
    Decide whether to replace an existing dividend field value with a new one.

    Rules:
    - Never replace with empty/None/whitespace-only values
    - Replace if there is no existing value (None, empty or whitespace-only)
    - Replace if the existing value contains the "Show More" placeholder text
    - Otherwise, keep the existing (good) data

    Args:
        existing_value: Value currently stored for the field, if any.
        new_value: Candidate value that was just extracted.

    Returns:
        True when ``new_value`` should overwrite ``existing_value``.
    """
    # A candidate that is blank carries no information; never store it.
    if not new_value or not str(new_value).strip():
        return False

    # Treat a whitespace-only stored value the same as a missing one, mirroring
    # how the candidate is validated above.
    existing_text = str(existing_value).strip() if existing_value else ''
    if not existing_text:
        return True

    # "Show More" is the text of the expand link, not dividend data, so any
    # stored value containing it is a placeholder that should be overwritten.
    return 'Show More' in existing_text
async def _save_debug_screenshot(page, filename: str, label: str) -> None:
    """Save a full-page screenshot as a debug artifact and print where it was stored."""
    png = await page.screenshot(full_page=True)
    path = save_debug_artifact(filename, png)
    print(f"DEBUG: {label} {path}")


async def extract_dividend_data(page, debug: bool = False) -> Dict[str, Any]:
    """
    Extract dividend information from Schwab stock page.

    The extraction is layered; every layer only fills fields that do not yet
    hold good data (see ``should_replace_dividend_value``):

    1. Wait for ``#dividends`` and poll until its content looks populated. If
       it never does, scroll it into view, click its header and regex-extract
       whatever the panel body shows.
    2. Wait for ``#dividend-grid`` (scrolling to ``stock-dividends`` and
       retrying once if it does not appear).
    3. Pick the first visible candidate section selector. If none is visible,
       fall back to a whole-page text search and return early when the page
       has no dividend-related text at all.
    4. If the section is ``#dividend-grid``, parse the grid rows/lines.
    5. Run the three-strategy in-page extraction (grid cells, the
       ``stock-dividends`` text, the ``#dividends`` panel).
    6. For each still-missing field, try the per-field selectors and finally
       an in-page text search.

    Args:
        page: Page object exposing the async API used below
            (``evaluate``, ``wait_for_selector``, ``is_visible``, ...).
        debug: When True, print progress and save screenshots/HTML through
            ``save_debug_artifact``. The flag only controls diagnostics; the
            extraction steps themselves run regardless of it.

    Returns:
        Dictionary with dividend data fields. Errors are caught (and printed
        in debug mode), so whatever was collected so far is returned.
    """
    dividend_data: Dict[str, Any] = {}
    try:
        if debug:
            print("DEBUG: Starting dividend data extraction...")
            # Take initial screenshot to see page state
            await _save_debug_screenshot(page, "debug_dividend_start.png", "Initial screenshot saved as")
            print("DEBUG: Waiting for dividends section to load...")

        # ------------------------------------------------------------------
        # Step 1: wait for the dividends section to load dynamically
        # ------------------------------------------------------------------
        try:
            # First wait for the dividends panel to appear
            await page.wait_for_selector('#dividends', timeout=15000)
            if debug:
                print("DEBUG: #dividends panel found")

            # Wait for dividend content to load dynamically
            dividend_loaded = False
            max_attempts = 5  # Reduced from 10 for faster tests
            attempt = 0
            while not dividend_loaded and attempt < max_attempts:
                attempt += 1
                if debug:
                    print(f"DEBUG: Attempt {attempt}/{max_attempts} - Waiting for dynamic dividend content...")

                # Check if the dividends section has been populated with actual content
                dividend_status = await page.evaluate('''
                    () => {
                        const result = { loaded: false, debug: {} };
                        // Look for the dividends panel content that should be populated
                        const dividendsPanel = document.querySelector('#dividends');
                        if (dividendsPanel) {
                            const panelBody = dividendsPanel.querySelector('.sdps-panel__body');
                            if (panelBody) {
                                const textContent = panelBody.textContent || '';
                                result.debug.panelBodyLength = textContent.length;
                                result.debug.panelBodySample = textContent.substring(0, 200);
                                // Check if the panel has been populated with actual dividend text
                                // (not just empty comments)
                                const hasRealContent = textContent.length > 50 && (
                                    textContent.includes('Previous Dividend Payment') ||
                                    textContent.includes('Pay Date') ||
                                    textContent.includes('Ex-Date') ||
                                    textContent.includes('Frequency') ||
                                    textContent.includes('Annual Dividend') ||
                                    textContent.includes('$') ||
                                    textContent.includes('%')
                                );
                                if (hasRealContent) {
                                    result.loaded = true;
                                    return result;
                                }
                            }
                        }
                        // Alternative: check for stock-dividends component
                        const stockDividends = document.querySelector('stock-dividends');
                        if (stockDividends) {
                            const text = stockDividends.textContent || '';
                            result.debug.stockDividendsLength = text.length;
                            result.debug.stockDividendsSample = text.substring(0, 100);
                            if (text.length > 20 && text.includes('$')) {
                                result.loaded = true;
                                return result;
                            }
                        }
                        // Alternative: check for any elements with dividend-related content
                        const allElements = document.querySelectorAll('#dividends *');
                        result.debug.totalElements = allElements.length;
                        for (let elem of allElements) {
                            const text = elem.textContent || '';
                            if (text.includes('Previous Dividend Payment') ||
                                (text.includes('$') && text.includes('.'))) {
                                result.loaded = true;
                                result.debug.foundInElement = elem.tagName + '.' + elem.className;
                                return result;
                            }
                        }
                        return result;
                    }
                ''')
                if debug:
                    print(f"DEBUG: Dividend status: {dividend_status}")

                dividend_loaded = dividend_status.get('loaded', False)
                if dividend_loaded:
                    if debug:
                        print("DEBUG: Dynamic dividend content loaded!")
                        await _save_debug_screenshot(
                            page, "debug_dividend_content_loaded.png", "Screenshot after content loaded:"
                        )
                    break

                # Wait between attempts to allow for async loading
                await page.wait_for_timeout(1000)  # Reduced from 2000ms for faster tests

            if not dividend_loaded:
                if debug:
                    print("DEBUG: Basic dividend content did not auto-load - this suggests the page is not behaving as expected")
                    print("DEBUG: Expected behavior: Basic dividend info should be visible without clicking 'Show More'")
                    # Try to force a page refresh or trigger loading
                    print("DEBUG: Attempting to trigger dividend content loading...")
                try:
                    # Try scrolling to the dividend section to trigger lazy loading
                    await page.evaluate('''
                        () => {
                            const dividendsPanel = document.querySelector('#dividends');
                            if (dividendsPanel) {
                                dividendsPanel.scrollIntoView({ behavior: 'smooth', block: 'center' });
                            }
                        }
                    ''')
                    await page.wait_for_timeout(3000)

                    # Try clicking on the dividends panel header to ensure it's active.
                    # Best effort: a failed click must not stop the extraction, but
                    # cancellation/interrupts are allowed to propagate.
                    try:
                        dividends_header = await page.query_selector('#dividends h2, #dividends .sdps-panel__title')
                        if dividends_header:
                            await dividends_header.click()
                            await page.wait_for_timeout(2000)
                            if debug:
                                print("DEBUG: Clicked on dividends panel header")
                    except Exception as click_error:
                        if debug:
                            print(f"DEBUG: Could not click dividends panel header: {click_error}")

                    # Check one more time if content loaded
                    final_status = await page.evaluate('''
                        () => {
                            const dividendsPanel = document.querySelector('#dividends');
                            if (dividendsPanel) {
                                const panelBody = dividendsPanel.querySelector('.sdps-panel__body');
                                if (panelBody) {
                                    const textContent = panelBody.textContent || '';
                                    return {
                                        length: textContent.length,
                                        sample: textContent.substring(0, 500),
                                        hasBasicData: textContent.includes('$') && (
                                            textContent.includes('Previous') ||
                                            textContent.includes('Pay Date') ||
                                            textContent.includes('Ex-Date')
                                        )
                                    };
                                }
                            }
                            return { length: 0, sample: '', hasBasicData: false };
                        }
                    ''')
                    if debug:
                        print(f"DEBUG: Final dividend panel status: {final_status}")

                    if final_status.get('hasBasicData'):
                        if debug:
                            print("DEBUG: Basic dividend data now detected after manual triggering!")
                        # Extract the data immediately while it's loaded
                        immediate_extraction = await page.evaluate(r'''
                            () => {
                                const results = {};
                                const dividendsPanel = document.querySelector('#dividends');
                                if (dividendsPanel) {
                                    const panelBody = dividendsPanel.querySelector('.sdps-panel__body');
                                    if (panelBody) {
                                        const fullText = panelBody.textContent || '';
                                        // Extract data using pattern matching from the full text
                                        const patterns = {
                                            'Previous Dividend Payment': /Previous Dividend Payment\s*\$([0-9]+\.[0-9]+)/,
                                            'Previous Pay Date': /Previous Pay Date\s*([A-Za-z]+ [0-9]{1,2}, [0-9]{4})/,
                                            'Previous Ex-Date': /Previous Ex-Date\s*([A-Za-z]+ [0-9]{1,2}, [0-9]{4})/,
                                            'Frequency': /Frequency\s*([A-Za-z]+)/,
                                            'Annual Dividend Rate': /(?:Annual Dividend Rate|IAD).*?\$([0-9]+\.[0-9]+)/,
                                            'Annual Dividend Yield': /([0-9]+\.[0-9]+%)(?=\s|Annual|$)/
                                        };
                                        for (const [field, pattern] of Object.entries(patterns)) {
                                            const match = fullText.match(pattern);
                                            if (match) {
                                                if (field === 'Previous Dividend Payment' || field === 'Annual Dividend Rate') {
                                                    results[field] = '$' + match[1];
                                                } else {
                                                    results[field] = match[1];
                                                }
                                            }
                                        }
                                    }
                                }
                                return results;
                            }
                        ''')
                        if debug:
                            print(f"DEBUG: Immediate extraction results: {immediate_extraction}")
                        if immediate_extraction:
                            dividend_data.update(immediate_extraction)
                            # Clean up the Frequency field if it has extra text
                            if 'Frequency' in dividend_data and 'Quarterly' in dividend_data['Frequency']:
                                dividend_data['Frequency'] = 'Quarterly'
                except Exception as e:
                    if debug:
                        print(f"DEBUG: Error during manual triggering: {e}")
                        await _save_debug_screenshot(page, "debug_dividend_timeout.png", "Screenshot after timeout:")
        except Exception as e:
            if debug:
                print(f"DEBUG: Error waiting for dividend content: {e}")

        # ------------------------------------------------------------------
        # Step 2: check for dividend grid directly without clicking
        # ------------------------------------------------------------------
        if debug:
            print("DEBUG: Checking for #dividend-grid...")
        try:
            await page.wait_for_selector('#dividend-grid', timeout=10000)
            if debug:
                print("DEBUG: #dividend-grid found!")
                await _save_debug_screenshot(page, "debug_dividend_grid_found.png", "Screenshot with dividend grid:")
        except Exception:
            if debug:
                print("DEBUG: #dividend-grid not found initially")
                await _save_debug_screenshot(page, "debug_dividend_no_grid.png", "Screenshot without grid:")
                # Try to scroll to the dividend section to ensure it's in view
                print("DEBUG: Scrolling to stock-dividends component...")
            try:
                await page.evaluate('''
                    () => {
                        const stockDividends = document.querySelector('stock-dividends');
                        if (stockDividends) {
                            stockDividends.scrollIntoView({ behavior: 'smooth', block: 'center' });
                        }
                    }
                ''')
                await page.wait_for_timeout(3000)
                if debug:
                    await _save_debug_screenshot(page, "debug_dividend_after_scroll.png", "Screenshot after scroll:")

                # Check again for dividend grid after scrolling
                try:
                    await page.wait_for_selector('#dividend-grid', timeout=5000)
                    if debug:
                        print("DEBUG: #dividend-grid found after scroll!")
                        await _save_debug_screenshot(
                            page, "debug_dividend_grid_after_scroll.png", "Screenshot with grid after scroll:"
                        )
                except Exception:
                    if debug:
                        print("DEBUG: #dividend-grid still not found after scroll")
            except Exception as e:
                if debug:
                    print(f"DEBUG: Error during scroll attempt: {e}")

        # ------------------------------------------------------------------
        # Step 3: locate the dividend section
        # ------------------------------------------------------------------
        # Common dividend section selectors used by financial websites
        dividend_selectors = [
            '#dividend-grid',  # Primary target based on user feedback
            'stock-dividends',  # Secondary target - the web component
            '#dividend-section',
            '#dividends-section',
            '.dividend-summary',
            '.dividends-summary',
            'div[data-testid*="dividend"]',
            'div[aria-label*="dividend"]',
            '[class*="dividend"]',
            'section:has-text("Dividend")',
            'div:has-text("Previous Dividend Payment")'
        ]

        # Try to find dividend section: the first visible candidate wins
        dividend_section = None
        for selector in dividend_selectors:
            try:
                if await page.is_visible(selector):
                    dividend_section = selector
                    if debug:
                        print(f"DEBUG: Found dividend section with selector: {selector}")
                    break
            except Exception:
                # A selector the page cannot evaluate is simply skipped.
                continue

        if not dividend_section:
            if debug:
                print("DEBUG: No dividend section found, trying broader search...")
                # In debug mode, capture the page content to help identify selectors
                page_content = await page.content()
                path_html = save_debug_artifact("debug_dividend_page.html", page_content)
                print(f"DEBUG: Page HTML saved to {path_html} for analysis")
                # Also save a screenshot to see the visual layout
                await _save_debug_screenshot(page, "debug_dividend_page.png", "Page screenshot saved to")

            # Fallback: look for dividend-related text anywhere on page
            dividend_text_exists = await page.evaluate('''
                () => {
                    const text = document.body.innerText.toLowerCase();
                    return text.includes('dividend') || text.includes('ex-date') || text.includes('pay date') || text.includes('previous dividend') || text.includes('iad');
                }
            ''')
            if debug:
                print(f"DEBUG: Dividend-related text found on page: {dividend_text_exists}")

            # Try scrolling down to reveal more content
            await page.evaluate('window.scrollTo(0, document.body.scrollHeight)')
            await page.wait_for_timeout(2000)

            if debug:
                # Diagnostic only: dump the text lines that might contain dividend info.
                # The string must be raw so that '\n' reaches the browser as a JS
                # escape sequence rather than a literal newline inside the JS string.
                dividend_related_text = await page.evaluate(r'''
                    () => {
                        const text = document.body.innerText;
                        const lines = text.split('\n');
                        const dividendLines = lines.filter(line => {
                            const lower = line.toLowerCase();
                            return lower.includes('dividend') || lower.includes('ex-date') ||
                                lower.includes('pay date') || lower.includes('previous') ||
                                lower.includes('iad') || lower.includes('frequency') ||
                                lower.includes('quarterly') || lower.includes('$0.26') ||
                                lower.includes('0.4865%') || lower.includes('$1.04') ||
                                lower.includes('annual dividend') || lower.includes('yield');
                        });
                        return dividendLines;
                    }
                ''')
                print(f"DEBUG: Found dividend-related text lines: {dividend_related_text}")

            # Try a more comprehensive search for dividend data
            all_dividend_info = await page.evaluate('''
                () => {
                    // Look for elements containing common dividend field names
                    const fieldNames = [
                        'Previous Dividend Payment', 'Next Dividend Payment',
                        'Previous Pay Date', 'Next Pay Date',
                        'Previous Ex-Date', 'Next Ex-Date', 'Ex-Date',
                        'Frequency', 'Annual Dividend Rate', 'IAD',
                        'Annual Dividend Yield', 'Dividend Yield'
                    ];
                    const results = {};
                    fieldNames.forEach(fieldName => {
                        // Search for elements containing this field name
                        const elements = Array.from(document.querySelectorAll('*')).filter(el =>
                            el.textContent && el.textContent.includes(fieldName) &&
                            el.children.length === 0 // Text nodes only
                        );
                        elements.forEach(el => {
                            // Look for value in nearby elements
                            const parent = el.parentElement;
                            if (parent) {
                                const siblings = Array.from(parent.children);
                                const currentIndex = siblings.indexOf(el);
                                // Check next siblings for values
                                for (let i = currentIndex + 1; i < siblings.length; i++) {
                                    const sibling = siblings[i];
                                    const text = sibling.textContent.trim();
                                    if (text && text !== fieldName && text.length > 0 && text.length < 50) {
                                        results[fieldName] = text;
                                        break;
                                    }
                                }
                                // Check same element for values after the field name
                                const fullText = el.textContent;
                                const fieldIndex = fullText.indexOf(fieldName);
                                if (fieldIndex >= 0) {
                                    const afterField = fullText.substring(fieldIndex + fieldName.length).trim();
                                    if (afterField && afterField.length > 0 && afterField.length < 50) {
                                        results[fieldName] = afterField;
                                    }
                                }
                            }
                        });
                    });
                    return results;
                }
            ''')
            if debug:
                print(f"DEBUG: Comprehensive dividend search results: {all_dividend_info}")

            # If we found data in the comprehensive search, use it only if we don't already have good data
            if all_dividend_info:
                for field, value in all_dividend_info.items():
                    if value and value.strip():
                        existing_value = dividend_data.get(field, '')
                        if should_replace_dividend_value(existing_value, value):
                            dividend_data[field] = value.strip()
                            if debug:
                                print(f"DEBUG: Added dividend field from comprehensive search: {field} = {value}")
                        elif debug:
                            print(f"DEBUG: Keeping existing good data for {field}: {existing_value} (ignoring comprehensive search value: {value})")

            if not dividend_text_exists:
                if debug:
                    print("DEBUG: No dividend-related content found on page")
                return dividend_data

            # Use body as fallback section for broad search
            dividend_section = 'body'
            if debug:
                print("DEBUG: Using body as dividend section for broad search")

        # ------------------------------------------------------------------
        # Step 4: grid-specific extraction
        # ------------------------------------------------------------------
        # If we found the dividend grid, use specific selectors based on user feedback
        if dividend_section == '#dividend-grid':
            if debug:
                print("DEBUG: Using specific dividend grid selectors...")
            try:
                if debug:
                    # Diagnostic only: report whether the grid is present and populated
                    grid_status = await page.evaluate('''
                        () => {
                            const dividendGrid = document.querySelector('#dividend-grid');
                            if (!dividendGrid) return { found: false, message: 'No #dividend-grid element found' };
                            const textContent = dividendGrid.textContent || '';
                            const hasContent = textContent.trim().length > 50;
                            const childCount = dividendGrid.children.length;
                            return {
                                found: true,
                                hasContent,
                                textLength: textContent.length,
                                childCount,
                                preview: textContent.substring(0, 200),
                                message: `Grid found with ${childCount} children, ${textContent.length} chars`
                            };
                        }
                    ''')
                    print(f"DEBUG: Dividend grid status: {grid_status}")

                # Extract dividend data using improved selectors
                specific_dividend_data = await page.evaluate(r'''
                    () => {
                        const results = {};
                        // Check if dividend grid exists and has content
                        const dividendGrid = document.querySelector('#dividend-grid');
                        if (dividendGrid) {
                            const allGridText = dividendGrid.textContent || '';
                            const lines = allGridText.split('\n').map(line => line.trim()).filter(line => line.length > 0);
                            // Try structured approach first - look for rows/cells
                            const dividendRows = dividendGrid.querySelectorAll('div[class*="row"], tr, .dividend-row, div:has(div)');
                            dividendRows.forEach((row, rowIndex) => {
                                const rowText = row.textContent || '';
                                // Look for dividend payment info
                                if (rowText.includes('Dividend Payment') || (rowText.includes('Previous') && rowText.includes('$'))) {
                                    const amountMatch = rowText.match(/\$[0-9]+\.[0-9]+/);
                                    if (amountMatch && !results['Previous Dividend Payment']) {
                                        results['Previous Dividend Payment'] = amountMatch[0];
                                    }
                                    // Look for dates in the same row
                                    const dateMatches = rowText.match(/([A-Za-z]+ [0-9]{1,2}, [0-9]{4})/g);
                                    if (dateMatches) {
                                        if (dateMatches.length >= 1 && !results['Previous Pay Date']) results['Previous Pay Date'] = dateMatches[0];
                                        if (dateMatches.length >= 2 && !results['Previous Ex-Date']) results['Previous Ex-Date'] = dateMatches[1];
                                    }
                                }
                            });
                            // Fallback: Parse all lines systematically
                            for (let i = 0; i < lines.length; i++) {
                                const line = lines[i];
                                const nextLine = i + 1 < lines.length ? lines[i + 1] : '';
                                // Match dividend payment
                                if ((line.includes('Previous Dividend Payment') || line.includes('Dividend Payment')) && !results['Previous Dividend Payment']) {
                                    const amountPattern = /\$[0-9]+\.[0-9]+/;
                                    let amount = line.match(amountPattern) || nextLine.match(amountPattern);
                                    if (amount) results['Previous Dividend Payment'] = amount[0];
                                }
                                // Match pay date
                                if (line.includes('Pay Date') && !results['Previous Pay Date']) {
                                    const datePattern = /[A-Za-z]{3,9} [0-9]{1,2}, [0-9]{4}/;
                                    let date = line.match(datePattern) || nextLine.match(datePattern);
                                    if (date) results['Previous Pay Date'] = date[0];
                                }
                                // Match ex-date
                                if (line.includes('Ex-Date') && !results['Previous Ex-Date']) {
                                    const datePattern = /[A-Za-z]{3,9} [0-9]{1,2}, [0-9]{4}/;
                                    let date = line.match(datePattern) || nextLine.match(datePattern);
                                    if (date) results['Previous Ex-Date'] = date[0];
                                }
                                // Match frequency
                                if (line.includes('Frequency') && !results['Frequency']) {
                                    const freqLine = line + ' ' + nextLine;
                                    if (freqLine.toLowerCase().includes('quarterly')) results['Frequency'] = 'Quarterly';
                                    else if (freqLine.toLowerCase().includes('monthly')) results['Frequency'] = 'Monthly';
                                    else if (freqLine.toLowerCase().includes('annual')) results['Frequency'] = 'Annual';
                                    else if (freqLine.toLowerCase().includes('semi')) results['Frequency'] = 'Semi-Annual';
                                }
                                // Match annual dividend rate
                                if ((line.includes('Annual Dividend Rate') || line.includes('IAD')) && !results['Annual Dividend Rate']) {
                                    const amountPattern = /\$[0-9]+\.[0-9]+/;
                                    let amount = line.match(amountPattern) || nextLine.match(amountPattern);
                                    if (amount) results['Annual Dividend Rate'] = amount[0];
                                }
                                // Match annual dividend yield
                                if (line.includes('Annual Dividend Yield') && !results['Annual Dividend Yield']) {
                                    const percentPattern = /[0-9]+\.[0-9]+%/;
                                    let percent = line.match(percentPattern) || nextLine.match(percentPattern);
                                    if (percent) results['Annual Dividend Yield'] = percent[0];
                                }
                            }
                        }
                        return results;
                    }
                ''')
                if debug:
                    print(f"DEBUG: Specific dividend grid extraction results: {specific_dividend_data}")

                # Add the extracted data to dividend_data only if we don't already have good data
                if specific_dividend_data:
                    for field, value in specific_dividend_data.items():
                        existing_value = dividend_data.get(field, '')
                        if should_replace_dividend_value(existing_value, value):
                            dividend_data[field] = value
                            if debug:
                                print(f"DEBUG: Updated {field} from specific extraction: {value}")
                        elif debug:
                            print(f"DEBUG: Keeping existing good data for {field}: {existing_value} (ignoring specific extraction value: {value})")
            except Exception as e:
                if debug:
                    print(f"DEBUG: Error in specific dividend grid extraction: {e}")

        # ------------------------------------------------------------------
        # Step 5: extraction from the dynamically loaded dividend content
        # ------------------------------------------------------------------
        # Extract dividend data using the correct structure from gemini analysis
        if debug:
            print("DEBUG: Extracting dividend data from dividend-grid structure...")
        # First try to extract data from the dynamically loaded dividend content
        try:
            dividend_dynamic_data = await page.evaluate(r'''
                () => {
                    const results = {};
                    // Strategy 1: Look for any dividend grid structure that was loaded
                    const dividendGrid = document.querySelector('#dividend-grid');
                    if (dividendGrid) {
                        const rows = dividendGrid.querySelectorAll('div.sdps-row, .row');
                        for (let row of rows) {
                            const cells = row.querySelectorAll('div[class*="col-"]');
                            if (cells.length >= 2) {
                                const label = cells[0].textContent.trim();
                                const value = cells[1].textContent.trim();
                                // Map the labels to our expected field names
                                if (label.includes('Previous Dividend Payment') || label.includes('Dividend Payment')) {
                                    results['Previous Dividend Payment'] = value;
                                } else if (label.includes('Previous Pay Date') || label.includes('Pay Date')) {
                                    results['Previous Pay Date'] = value;
                                } else if (label.includes('Previous Ex-Date') || label.includes('Ex-Date')) {
                                    results['Previous Ex-Date'] = value;
                                } else if (label.includes('Frequency')) {
                                    results['Frequency'] = value;
                                } else if (label.includes('Annual Dividend Rate') || label.includes('IAD')) {
                                    results['Annual Dividend Rate'] = value;
                                } else if (label.includes('Annual Dividend Yield')) {
                                    results['Annual Dividend Yield'] = value;
                                }
                            }
                        }
                        if (Object.keys(results).length > 0) {
                            return results;
                        }
                    }
                    // Strategy 2: Look for stock-dividends component content
                    const stockDividends = document.querySelector('stock-dividends');
                    if (stockDividends) {
                        const allText = stockDividends.textContent || '';
                        const lines = allText.split('\n').map(line => line.trim()).filter(line => line);
                        for (let i = 0; i < lines.length; i++) {
                            const line = lines[i];
                            const nextLine = i + 1 < lines.length ? lines[i + 1] : '';
                            if (line.includes('Previous Dividend Payment') || line.includes('Dividend Payment')) {
                                const amountMatch = (line + ' ' + nextLine).match(/\$[0-9]+\.[0-9]+/);
                                if (amountMatch) results['Previous Dividend Payment'] = amountMatch[0];
                            } else if (line.includes('Pay Date')) {
                                const dateMatch = (line + ' ' + nextLine).match(/[A-Za-z]+ [0-9]{1,2}, [0-9]{4}/);
                                if (dateMatch) results['Previous Pay Date'] = dateMatch[0];
                            } else if (line.includes('Ex-Date')) {
                                const dateMatch = (line + ' ' + nextLine).match(/[A-Za-z]+ [0-9]{1,2}, [0-9]{4}/);
                                if (dateMatch) results['Previous Ex-Date'] = dateMatch[0];
                            } else if (line.includes('Frequency')) {
                                if (line.toLowerCase().includes('quarterly') || nextLine.toLowerCase().includes('quarterly')) {
                                    results['Frequency'] = 'Quarterly';
                                } else if (line.toLowerCase().includes('monthly') || nextLine.toLowerCase().includes('monthly')) {
                                    results['Frequency'] = 'Monthly';
                                } else if (line.toLowerCase().includes('annual') || nextLine.toLowerCase().includes('annual')) {
                                    results['Frequency'] = 'Annual';
                                }
                            } else if (line.includes('Annual Dividend Rate') || line.includes('IAD')) {
                                const amountMatch = (line + ' ' + nextLine).match(/\$[0-9]+\.[0-9]+/);
                                if (amountMatch) results['Annual Dividend Rate'] = amountMatch[0];
                            } else if (line.includes('Annual Dividend Yield')) {
                                const percentMatch = (line + ' ' + nextLine).match(/[0-9]+\.[0-9]+%/);
                                if (percentMatch) results['Annual Dividend Yield'] = percentMatch[0];
                            }
                        }
                        if (Object.keys(results).length > 0) {
                            return results;
                        }
                    }
                    // Strategy 3: Look within entire dividends panel for any structured content
                    const dividendsPanel = document.querySelector('#dividends');
                    if (dividendsPanel) {
                        const allElements = dividendsPanel.querySelectorAll('*');
                        for (let elem of allElements) {
                            const text = elem.textContent || '';
                            // Look for dollar amounts near dividend-related text
                            if (text.includes('Previous Dividend Payment') || text.includes('Dividend Payment')) {
                                const parent = elem.parentElement;
                                if (parent) {
                                    const siblings = Array.from(parent.children);
                                    const currentIndex = siblings.indexOf(elem);
                                    // Check next siblings for values
                                    for (let j = currentIndex + 1; j < siblings.length; j++) {
                                        const sibling = siblings[j];
                                        const siblingText = sibling.textContent.trim();
                                        const amountMatch = siblingText.match(/\$[0-9]+\.[0-9]+/);
                                        if (amountMatch) {
                                            results['Previous Dividend Payment'] = amountMatch[0];
                                            break;
                                        }
                                    }
                                }
                            }
                            // Similar logic for other fields...
                            // (truncated for brevity but would include Pay Date, Ex-Date, etc.)
                        }
                    }
                    return results;
                }
            ''')
            if debug:
                print(f"DEBUG: Dynamic dividend extraction results: {dividend_dynamic_data}")
            if dividend_dynamic_data:
                for field, value in dividend_dynamic_data.items():
                    existing_value = dividend_data.get(field, '')
                    if should_replace_dividend_value(existing_value, value):
                        dividend_data[field] = value
                        if debug:
                            print(f"DEBUG: Updated {field} from dynamic extraction: {value}")
                    elif debug:
                        print(f"DEBUG: Keeping existing good data for {field}: {existing_value} (ignoring dynamic extraction value: {value})")
        except Exception as e:
            if debug:
                print(f"DEBUG: Error in dynamic dividend extraction: {e}")

        # ------------------------------------------------------------------
        # Step 6: per-field selector and text-search fallbacks
        # ------------------------------------------------------------------
        # Define dividend fields and their possible selectors as fallback
        dividend_fields = {
            'Previous Dividend Payment': [
                '#dividend-grid div:has-text("Previous Dividend Payment") ~ div',
                '#dividend-grid div:has-text("Dividend Payment") ~ div',
                '#dividends span:has-text("Previous Dividend Payment") + span',
                '#dividends div:has-text("Previous Dividend Payment") + div',
                '#dividends *:has-text("Previous Dividend Payment") ~ *',
                'stock-dividends span:has-text("Previous Dividend Payment") + span',
                'stock-dividends div:has-text("Previous Dividend Payment") + div',
                'span:has-text("Previous Dividend Payment") + span',
                'div:has-text("Previous Dividend Payment") + div',
                '*:has-text("Previous Dividend Payment") ~ *',
                'span:has-text("Next Dividend Payment") + span',
                'div:has-text("Next Dividend Payment") + div',
                '*:has-text("Next Dividend Payment") ~ *',
                '[data-field="dividend-payment"]',
                '.dividend-payment'
            ],
            'Previous Pay Date': [
                '#dividend-grid div:has-text("Previous Pay Date") ~ div',
                '#dividend-grid div:has-text("Pay Date") ~ div',
                '#dividends span:has-text("Previous Pay Date") + span',
                '#dividends div:has-text("Previous Pay Date") + div',
                '#dividends *:has-text("Previous Pay Date") ~ *',
                'stock-dividends span:has-text("Previous Pay Date") + span',
                'stock-dividends div:has-text("Previous Pay Date") + div',
                'span:has-text("Previous Pay Date") + span',
                'div:has-text("Previous Pay Date") + div',
                '*:has-text("Previous Pay Date") ~ *',
                'span:has-text("Next Pay Date") + span',
                'div:has-text("Next Pay Date") + div',
                '*:has-text("Next Pay Date") ~ *',
                '*:has-text("Pay Date") ~ *',
                '[data-field="pay-date"]',
                '.pay-date'
            ],
            'Previous Ex-Date': [
                '#dividend-grid div:has-text("Previous Ex-Date") ~ div',
                '#dividend-grid div:has-text("Ex-Date") ~ div',
                '#dividends span:has-text("Previous Ex-Date") + span',
                '#dividends div:has-text("Previous Ex-Date") + div',
                '#dividends *:has-text("Previous Ex-Date") ~ *',
                'stock-dividends span:has-text("Previous Ex-Date") + span',
                'stock-dividends div:has-text("Previous Ex-Date") + div',
                'span:has-text("Previous Ex-Date") + span',
                'div:has-text("Previous Ex-Date") + div',
                '*:has-text("Previous Ex-Date") ~ *',
                'span:has-text("Next Ex-Date") + span',
                'div:has-text("Next Ex-Date") + div',
                '*:has-text("Next Ex-Date") ~ *',
                '*:has-text("Ex-Date") ~ *',
                '[data-field="ex-date"]',
                '.ex-date'
            ],
            'Frequency': [
                '#dividend-grid div:has-text("Frequency") ~ div',
                '#dividends span:has-text("Frequency") + span',
                '#dividends div:has-text("Frequency") + div',
                '#dividends *:has-text("Frequency") ~ *',
                'stock-dividends span:has-text("Frequency") + span',
                'stock-dividends div:has-text("Frequency") + div',
                'span:has-text("Frequency") + span',
                'div:has-text("Frequency") + div',
                '*:has-text("Frequency") ~ *',
                '[data-field="frequency"]',
                '.dividend-frequency',
                '.frequency'
            ],
            'Annual Dividend Rate': [
                '#dividend-grid div:has-text("Annual Dividend Rate") ~ div',
                '#dividend-grid div:has-text("IAD") ~ div',
                '#dividends span:has-text("Annual Dividend Rate") + span',
                '#dividends div:has-text("Annual Dividend Rate") + div',
                '#dividends *:has-text("Annual Dividend Rate") ~ *',
                '#dividends span:has-text("IAD") + span',
                '#dividends *:has-text("IAD") ~ *',
                'stock-dividends span:has-text("Annual Dividend Rate") + span',
                'stock-dividends div:has-text("Annual Dividend Rate") + div',
                'stock-dividends span:has-text("IAD") + span',
                'span:has-text("Annual Dividend Rate") + span',
                'div:has-text("Annual Dividend Rate") + div',
                '*:has-text("Annual Dividend Rate") ~ *',
                'span:has-text("IAD") + span',
                '*:has-text("IAD") ~ *',
                '[data-field="annual-rate"]',
                '.annual-dividend-rate'
            ],
            'Annual Dividend Yield': [
                '#dividend-grid div:has-text("Annual Dividend Yield") ~ div',
                '#dividends span:has-text("Annual Dividend Yield") + span',
                '#dividends div:has-text("Annual Dividend Yield") + div',
                '#dividends *:has-text("Annual Dividend Yield") ~ *',
                'stock-dividends span:has-text("Annual Dividend Yield") + span',
                'stock-dividends div:has-text("Annual Dividend Yield") + div',
                'span:has-text("Annual Dividend Yield") + span',
                'div:has-text("Annual Dividend Yield") + div',
                '*:has-text("Annual Dividend Yield") ~ *',
                '[data-field="dividend-yield"]',
                '.dividend-yield'
            ]
        }

        # Extract each dividend field using multiple selector strategies
        for field_name, selectors in dividend_fields.items():
            existing_value = dividend_data.get(field_name, '')
            # should_replace_dividend_value() only looks at the existing value once
            # the candidate is non-empty, so probing with the (non-empty) field name
            # tells us whether ANY candidate could be accepted. If not, skip the
            # page round-trips below: they could not change the result.
            if not should_replace_dividend_value(existing_value, field_name):
                if debug:
                    print(f"DEBUG: Keeping existing good data for {field_name}: {existing_value} (skipping selector/JS search)")
                continue

            field_found = False

            # Try each selector for this field
            for selector in selectors:
                try:
                    # Scope search within dividend section if found, otherwise search whole page
                    full_selector = f'{dividend_section} {selector}' if dividend_section != 'body' else selector
                    if not await page.is_visible(full_selector, timeout=1000):
                        continue
                    value = await page.inner_text(full_selector)
                except Exception:
                    # An unsupported or failing selector is skipped; try the next one.
                    continue

                clean_value = value.strip()
                if clean_value and clean_value != field_name:  # Ensure we got actual value, not the label
                    dividend_data[field_name] = clean_value
                    field_found = True
                    if debug:
                        print(f"DEBUG: Found {field_name}: {clean_value} (selector: {full_selector})")
                    break

            if field_found:
                continue

            # If standard selectors failed, try JavaScript-based text search as fallback
            try:
                # Try multiple variations of the field name
                search_terms = [field_name]
                if "Previous" in field_name:
                    search_terms.append(field_name.replace("Previous", "Next"))
                if "Annual Dividend Rate" in field_name:
                    search_terms.append("IAD")
                if "Annual Dividend Yield" in field_name:
                    search_terms.append("Dividend Yield")

                # NOTE(review): the script below leaves the container loop after the
                # first container that exists, so `stock-dividends` and `document`
                # are only searched when the earlier containers are absent - confirm
                # that this is the intended scoping.
                for search_term in search_terms:
                    value = await page.evaluate(rf'''
                        () => {{
                            const searchText = "{search_term}";
                            // First check within the dividends section specifically
                            const dividendsPanel = document.querySelector('#dividends');
                            const stockDividends = document.querySelector('stock-dividends');
                            const searchContainers = [dividendsPanel, stockDividends, document];
                            for (let container of searchContainers) {{
                                if (!container) continue;
                                const elements = Array.from(container.querySelectorAll('*'));
                                for (let elem of elements) {{
                                    if (elem.textContent && elem.textContent.includes(searchText)) {{
                                        // Look for next sibling or nearby element with value
                                        let candidate = elem.nextElementSibling;
                                        if (candidate && candidate.textContent &&
                                            !candidate.textContent.includes(searchText) &&
                                            candidate.textContent.trim().length > 0) {{
                                            return candidate.textContent.trim();
                                        }}
                                        // Try parent's next sibling
                                        candidate = elem.parentElement?.nextElementSibling;
                                        if (candidate && candidate.textContent &&
                                            !candidate.textContent.includes(searchText) &&
                                            candidate.textContent.trim().length > 0) {{
                                            return candidate.textContent.trim();
                                        }}
                                        // Try looking in the same element's parent for nearby text
                                        const parent = elem.parentElement;
                                        if (parent) {{
                                            const parentText = parent.textContent;
                                            const lines = parentText.split('\n');
                                            for (let i = 0; i < lines.length; i++) {{
                                                if (lines[i].includes(searchText) && i + 1 < lines.length) {{
                                                    const nextLine = lines[i + 1].trim();
                                                    if (nextLine && !nextLine.includes(searchText)) {{
                                                        return nextLine;
                                                    }}
                                                }}
                                            }}
                                        }}
                                    }}
                                }}
                                // If found in this container, stop searching
                                if (container !== document) {{
                                    break;
                                }}
                            }}
                            return null;
                        }}
                    ''')
                    if value and value.strip():
                        dividend_data[field_name] = value.strip()
                        if debug:
                            print(f"DEBUG: Found {field_name} via JS search with term '{search_term}': {value}")
                        break
            except Exception as e:
                if debug:
                    print(f"DEBUG: Could not find {field_name}: {e}")

        if debug:
            print(f"DEBUG: Extracted dividend data: {dividend_data}")
        return dividend_data
    except Exception as e:
        # Top-level boundary: report (in debug mode) and return whatever was collected.
        if debug:
            print(f"DEBUG: Error extracting dividend data: {e}")
        return dividend_data
async def extract(page, debug: bool = False) -> Dict[str, Any]:
    """Thin alias kept for compatibility; delegates to `extract_dividend_data`."""
    dividend_info = await extract_dividend_data(page, debug=debug)
    return dividend_info