Fix build: Bundle schwab_scraper source and use local dependencies
All checks were successful
Build and Push Docker Image / build (push) Successful in 34s

This commit is contained in:
2026-04-24 01:50:20 +00:00
parent 02ac293692
commit 650ea2d087
43 changed files with 10900 additions and 41 deletions

View File

@@ -0,0 +1,239 @@
from typing import Optional, Tuple
import logging
async def find_report(page, debug: bool = False) -> Tuple[Optional[str], Optional[str]]:
    """Locate the Morningstar Equity Report link and date on the stock page.

    Uses multiple fallback strategies to handle Schwab website changes:

    1. The original ``div[id='Morningstar Equity Report'] a.sr-report-link``
       selector.
    2. A list of looser CSS selectors, tried in order.
    3. A JavaScript scan of every ``a[href]`` for a URL containing both
       'morningstar' and 'pdf'.

    Args:
        page: Playwright page (async API) showing the stock page.
        debug: When True, log progress and, if nothing is found, save a
            full-page screenshot to ``debug_morningstar_not_found.png`` in the
            current working directory.

    Returns:
        Tuple of (url, date) where:
        - url: The href attribute if it's a traditional link, or the special
          marker '__CLICK_TO_OPEN__' if the primary link has an empty href
          (JavaScript/blob link that requires clicking). A link matched by a
          fallback selector is returned as-is, so url may be None/empty there.
        - date: The report date string if found, else None.
        ``(None, None)`` when no strategy finds a link.
    """
    logger = logging.getLogger(__name__)

    # Date element of the primary layout (spaces in the id are escaped).
    primary_date_selector = (
        r"#Morningstar\ Equity\ Report > span:nth-child(3) > sdps-date-time > time > span:nth-child(2)"
    )

    # Strategy 1: Original selector
    report_link_selector = "div[id='Morningstar Equity Report'] a.sr-report-link"
    if await page.is_visible(report_link_selector):
        if debug:
            logger.debug("Found Morningstar report using original selector")
        report_link_element = page.locator(report_link_selector)
        await report_link_element.scroll_into_view_if_needed()
        url = await report_link_element.get_attribute("href")

        date_locator = page.locator(primary_date_selector)
        date_text = (await date_locator.inner_text()).strip() if await date_locator.count() > 0 else None

        # An empty href means a modern web component that builds a blob URL on click.
        if not url:
            if debug:
                logger.debug("Link found but href is empty - this is a modern web component that generates blob URLs on click")
            # Special marker: the caller must click the link to obtain the PDF.
            return '__CLICK_TO_OPEN__', date_text
        return url, date_text

    # Strategy 2: Look for any link containing "morningstar" in research section
    if debug:
        logger.debug("Original selector failed, trying fallback selectors...")
    fallback_selectors = [
        "a.sr-report-link[href*='morningstar']",
        "a[href*='morningstar'][href*='pdf']",
        "#morningstar-section a.sr-report-link",
        "div[id*='Morningstar'] a",
    ]
    date_selectors = [
        primary_date_selector,
        "sdps-date-time time span",
        "time span",
    ]
    for selector in fallback_selectors:
        try:
            if await page.is_visible(selector, timeout=2000):
                if debug:
                    logger.debug(f"Found Morningstar report using fallback selector: {selector}")
                report_link_element = page.locator(selector).first
                await report_link_element.scroll_into_view_if_needed()
                url = await report_link_element.get_attribute("href")

                # Try progressively looser date selectors; first non-empty text wins.
                date_text = None
                for date_sel in date_selectors:
                    try:
                        date_locator = page.locator(date_sel)
                        if await date_locator.count() > 0:
                            date_text = (await date_locator.first.inner_text()).strip()
                            if date_text:
                                break
                    except Exception:
                        # Best effort only. Catch Exception (not a bare except) so
                        # task cancellation and KeyboardInterrupt still propagate.
                        continue
                return url, date_text
        except Exception as e:
            if debug:
                logger.debug(f"Fallback selector {selector} failed: {e}")
            continue

    # Strategy 3: Use JavaScript to search for Morningstar links
    if debug:
        logger.debug("All CSS selectors failed, trying JavaScript search...")
    try:
        result = await page.evaluate("""
            () => {
                // Look for any link containing 'morningstar' and 'pdf'
                const links = Array.from(document.querySelectorAll('a[href]'));
                const morningstarLink = links.find(link =>
                    link.href.toLowerCase().includes('morningstar') &&
                    link.href.toLowerCase().includes('pdf')
                );
                if (morningstarLink) {
                    // Try to find associated date
                    let dateText = null;
                    const parent = morningstarLink.closest('[id*="Morningstar"]') || morningstarLink.parentElement;
                    if (parent) {
                        const timeElement = parent.querySelector('time');
                        if (timeElement) {
                            dateText = timeElement.textContent.trim();
                        }
                    }
                    return {
                        url: morningstarLink.href,
                        date: dateText
                    };
                }
                return null;
            }
        """)
        if result and result.get('url'):
            if debug:
                logger.debug(f"Found Morningstar report using JavaScript search: {result['url']}")
            return result['url'], result.get('date')
    except Exception as e:
        if debug:
            logger.debug(f"JavaScript search failed: {e}")

    # No report found
    if debug:
        logger.debug("No Morningstar report link found using any strategy")
        # Capture page state for debugging
        try:
            await page.screenshot(path="debug_morningstar_not_found.png", full_page=True)
            logger.debug("Saved debug screenshot to: debug_morningstar_not_found.png")
            # Log available elements for debugging
            page_info = await page.evaluate("""
                () => {
                    return {
                        hasMorningstarSection: !!document.querySelector('#morningstar-section'),
                        hasMorningstarDiv: !!document.querySelector('div[id*="Morningstar"]'),
                        morningstarLinks: Array.from(document.querySelectorAll('a[href]'))
                            .filter(a => a.href.toLowerCase().includes('morningstar'))
                            .length,
                        allReportLinks: Array.from(document.querySelectorAll('a.sr-report-link')).length
                    }
                }
            """)
            logger.debug(f"Page state: {page_info}")
        except Exception as e:
            logger.debug(f"Failed to capture debug info: {e}")
    return None, None
async def download_report_as_bytes(page, url: str, debug: bool = False) -> Optional[bytes]:
    """Open the PDF in a new page and return bytes via data URL conversion.

    The PDF is fetched inside the newly opened page and converted to Base64
    with a FileReader, then decoded in Python.

    Args:
        page: The current Playwright page (async API).
        url: Either a traditional URL or '__CLICK_TO_OPEN__' marker for blob URLs
        debug: Enable debug logging

    Returns:
        PDF bytes if successful, None otherwise (empty url, the click/popup
        failed, the fetch returned a non-2xx status, or extraction failed).

    Raises:
        Exception: for a traditional URL, errors while opening the popup or
            waiting for it to load are propagated (unchanged from before).
    """
    import base64

    logger = logging.getLogger(__name__)
    if not url:
        return None

    new_page = None
    try:
        # Handle blob URL case (modern web component)
        if url == '__CLICK_TO_OPEN__':
            if debug:
                logger.debug("Handling blob URL - clicking link to open PDF")
            report_link_selector = "div[id='Morningstar Equity Report'] a.sr-report-link"
            try:
                # Register the popup listener BEFORE clicking: a coroutine from
                # wait_for_event() does not start listening until awaited, so
                # awaiting it after the click can miss the event.
                async with page.context.expect_page(timeout=15000) as new_page_info:
                    await page.click(report_link_selector)
                new_page = await new_page_info.value
                if debug:
                    logger.debug(f"New page opened with URL: {new_page.url}")
                # Wait for PDF to load
                await new_page.wait_for_load_state('load', timeout=10000)
                # The PDF is now loaded as a blob URL - extract it
                blob_url = new_page.url
            except Exception as e:
                if debug:
                    logger.debug(f"Error clicking link to open PDF: {e}")
                return None
        else:
            # Traditional URL case
            if debug:
                logger.debug(f"Opening PDF from traditional URL: {url}")
            async with page.context.expect_page() as new_page_info:
                await page.evaluate("url => window.open(url, '_blank')", url)
            new_page = await new_page_info.value
            await new_page.wait_for_load_state('load')
            blob_url = url

        # Fetch and convert to Base64 in browser context
        try:
            pdf_base64 = await new_page.evaluate(
                """
                async (url) => {
                    const response = await fetch(url);
                    // Do not hand an HTTP error page back as if it were the PDF.
                    if (!response.ok) return null;
                    const blob = await response.blob();
                    return await new Promise((resolve) => {
                        const reader = new FileReader();
                        reader.onloadend = () => resolve(reader.result.split(',')[1]);
                        reader.readAsDataURL(blob);
                    });
                }
                """,
                blob_url,
            )
            if not pdf_base64:
                return None
            return base64.b64decode(pdf_base64)
        except Exception as e:
            if debug:
                logger.debug(f"Error extracting PDF bytes: {e}")
            return None
    finally:
        # Always close the popup, on success and on every failure path.
        if new_page is not None:
            try:
                await new_page.close()
            except Exception as e:
                if debug:
                    logger.debug(f"Error closing PDF page: {e}")

View File

@@ -0,0 +1,80 @@
import re
from io import BytesIO
from typing import Dict
import pdfplumber
def clean_value(label: str, value: str) -> str:
    """Clean the raw text extracted next to *label*.

    Args:
        label: The report label the value was found beside.
        value: The raw, space-joined text to the right of the label.

    Returns:
        A normalized string:
        - "Morningstar Rating": ``"<n> stars"`` where n is the number of 'Q'
          characters in the value.
        - "Economic Moat": "Wide", "Narrow" or "None" when one of those words
          occurs in the value.
        - "Fair Value" / "1-Star Price" / "5-Star Price": the leading number
          with two decimals, when the value starts with one.
        - "Assessment": the first whitespace-separated token.
        - "52-Week Range" (also spelled "52-Week-Range"): em dashes replaced
          with '-'.
        In every other case the value is returned unchanged.
    """
    if label == "Morningstar Rating":
        return f"{value.count('Q')} stars"

    if label == "Economic Moat":
        for moat in ("Wide", "Narrow", "None"):
            if moat in value:
                return moat
        return value

    if label in ("Fair Value", "1-Star Price", "5-Star Price"):
        match = re.match(r"[\d,]+\.\d{2}", value)
        return match.group(0) if match else value

    if label == "Assessment":
        tokens = value.split()
        # An empty/blank value has no first token; return it unchanged
        # instead of raising IndexError.
        return tokens[0] if tokens else value

    if label in ("52-Week Range", "52-Week-Range"):
        return value.replace('\u2014', '-')

    return value
def parse(pdf_content: bytes) -> Dict[str, str]:
    """Parse a Morningstar PDF report to extract key data points.

    Only the third page of the PDF is inspected. Words on that page are joined
    into one- to three-word candidates; when a candidate equals a known label,
    the words on the same line to its right form the value.

    Args:
        pdf_content: Raw bytes of the PDF report.

    Returns:
        A dict keyed by the label names present in the report. Empty when the
        PDF has fewer than three pages or no label is found.
    """
    with pdfplumber.open(BytesIO(pdf_content)) as pdf:
        # Guard against short PDFs instead of raising IndexError on pages[2].
        if len(pdf.pages) < 3:
            return {}
        page = pdf.pages[2]  # Page 3
        words = page.extract_words(x_tolerance=1, y_tolerance=1, keep_blank_chars=False)

    labels = {
        "Fair Value", "1-Star Price", "5-Star Price", "Assessment",
        "Dividend Yield", "Capital Allocation", "52-Week Range", "Investment Style",
        "Economic Moat", "Morningstar Rating",
    }
    data: Dict[str, str] = {}

    for i, word in enumerate(words):
        # Combine up to three consecutive words to form potential labels
        for j in range(i + 1, min(i + 4, len(words))):
            potential_label = " ".join(w['text'] for w in words[i:j])
            if potential_label not in labels:
                continue

            # Value = later words on the same line (top within 2 units) that
            # start right of the label, no more than 100 units away.
            label_end_x = words[j - 1]['x1']
            value_words = [
                w['text'] for w in words[j:]
                if abs(w['top'] - word['top']) < 2
                and w['x0'] > label_end_x
                and w['x0'] - label_end_x < 100
            ]
            if value_words:
                value = " ".join(value_words)
                if potential_label == "Economic Moat":
                    # Unlike other labels, a moat is stored only when it is
                    # one of the recognised ratings.
                    for moat in ("Wide", "Narrow", "None"):
                        if moat in value:
                            data[potential_label] = moat
                            break
                else:
                    data[potential_label] = clean_value(potential_label, value)
            break  # Move to the next word once a label is found
    return data

View File

@@ -0,0 +1,490 @@
"""Phase 1: API-Based Data Extraction (EXPERIMENTAL - NON-FUNCTIONAL)
⚠️ **STATUS: NON-FUNCTIONAL DUE TO CORS RESTRICTIONS** ⚠️
This module was an attempt to extract equity data by calling Schwab's REST APIs directly.
While the APIs exist and were discovered via HAR analysis, they are NOT accessible from
this scraper due to fundamental browser security limitations (CORS).
## Why This Approach Failed:
1. **CORS (Cross-Origin Resource Sharing) Restrictions**:
- Research page: `client.schwab.com`, APIs: `ausgateway.schwab.com` (different origins)
- Browser blocks cross-origin fetch() calls even from page.evaluate()
- Results in "TypeError: Failed to fetch"
2. **Authentication Complexity**:
- Direct HTTP (aiohttp) with cookies: 401/403 errors
- Playwright page.request.fetch(): 401 errors (separate context)
- Likely requires dynamic tokens beyond cookies
## Recommendation:
**Use `phase1_scraper.py` (DOM scraping) instead**. It works reliably with authenticated
sessions and extracts all Phase 1 fields without CORS limitations.
## API Endpoints (discovered but inaccessible):
- Quote: /api/is.ResearchExperience/v1/quote
- Dividends: /api/is.ResearchExperience/v1/events/dividends
- Earnings: /api/is.ResearchExperience/v1/events/earnings
- Share Profile: /api/is.ResearchExperience/v1/shareprofile
"""
from typing import Dict, Any, Optional, List
import logging
import uuid
import aiohttp
from playwright.async_api import Page
from ...core import (
QuoteData, EnhancedDividends, EarningsData,
CalculatedMetrics, EquityPhase1Data
)
logger = logging.getLogger(__name__)
def _parse_float(value: Any) -> Optional[float]:
"""Safely parse a value to float."""
if value is None:
return None
try:
if isinstance(value, str):
# Remove % sign if present
value = value.replace('%', '').strip()
return float(value)
except (ValueError, TypeError):
return None
def _parse_market_cap(value: str) -> Optional[str]:
"""Parse market cap string like '$3.03T' or '$462.11B'."""
if not value:
return None
# Keep the formatted string as-is for readability
return value.strip()
def _parse_volume(value: Any) -> Optional[int]:
"""Parse volume value."""
if value is None:
return None
try:
return int(float(value))
except (ValueError, TypeError):
return None
def parse_quote_api_response(data: Dict[str, Any]) -> QuoteData:
    """Parse quote API response into QuoteData object.

    API Response Structure:
        {
            "reference": {
                "symbol": "JNJ",
                "companyName": "JOHNSON & JOHNSON",
                "exchangeName": "NYSE"
            },
            "quote": {
                "lastPrice": 193.155,
                "netChange": 1.275,
                "netChangePercent": 0.6644778,
                "postMarketChange": 0.0,
                "postMarketPercentChange": 0.0,
                "tradeTime": "2025-10-22T17:06:42.008Z"
            },
            "regularQuote": {
                "lastPrice": 193.155,
                "lastSize": 100.0,
                "netChange": 1.275,
                "percentChange": 0.6644778,
                ...
            }
        }

    Args:
        data: One decoded quote object from the API.

    Returns:
        A QuoteData with every field that could be read; fields that are
        missing from the response stay at their defaults. Parsing errors are
        logged at debug level and never raised.
    """
    quote = QuoteData()
    try:
        # "or {}" also covers sections present but null in the JSON, which
        # would otherwise raise on .get() and abort every field below.
        reference = data.get('reference') or {}
        quote_data = data.get('quote') or {}
        regular_quote = data.get('regularQuote') or {}

        # Basic info
        quote.exchange = reference.get('exchangeName')

        # Price data
        quote.price = _parse_float(quote_data.get('lastPrice'))
        quote.change = _parse_float(quote_data.get('netChange'))
        quote.change_percent = _parse_float(quote_data.get('netChangePercent'))

        # After hours (post market)
        quote.after_hours_change = _parse_float(quote_data.get('postMarketChange'))
        quote.after_hours_change_percent = _parse_float(quote_data.get('postMarketPercentChange'))

        # Extended quote data
        quote.previous_close = _parse_float(regular_quote.get('closePrice'))
        quote.open = _parse_float(regular_quote.get('openPrice'))
        quote.bid = _parse_float(regular_quote.get('bidPrice'))
        quote.ask = _parse_float(regular_quote.get('askPrice'))
        quote.volume = _parse_volume(regular_quote.get('totalVolume'))
        quote.day_range_low = _parse_float(regular_quote.get('lowPrice'))
        quote.day_range_high = _parse_float(regular_quote.get('highPrice'))
        quote.week_52_low = _parse_float(regular_quote.get('priceLow52W'))
        quote.week_52_high = _parse_float(regular_quote.get('priceHigh52W'))

        # Bid/Ask size, only recorded when at least one side is non-zero
        bid_size = regular_quote.get('bidSize', 0)
        ask_size = regular_quote.get('askSize', 0)
        if bid_size or ask_size:
            quote.bid_ask_size = f"{bid_size}/{ask_size}"

        # Volume vs average
        # NOTE(review): this stores the raw 'averageVolumeDaily' value, not a
        # comparison label - confirm that is what volume_vs_avg should hold.
        avg_volume_label = regular_quote.get('averageVolumeDaily')
        if avg_volume_label:
            quote.volume_vs_avg = avg_volume_label
    except Exception as e:
        logger.debug(f"Error parsing quote API response: {e}")
    return quote
def parse_dividends_api_response(data: Dict[str, Any]) -> EnhancedDividends:
    """Build an EnhancedDividends object from the dividends API response.

    API Response Structure:
        {
            "symbol": "JNJ",
            "currentAnnualDividendMethod": "IAD",
            "status": "DIVIDENDS_PAID_CURRENTLY",
            "dividends": [
                {
                    "dividendPayment": 1.3,
                    "dividendPayDate": "December 09, 2025",
                    "dividendExDate": "November 25, 2025",
                    "dividendFrequency": "Quarterly",
                    "annualDividendRate": 5.2,
                    "dividendYield": "2.71%"
                },
                ...
            ]
        }

    The first list entry fills the "next" fields plus frequency, annual rate
    and yield; the second entry, when present, fills the "previous" fields.
    Errors are logged at debug level and a partially filled object is
    returned.
    """
    result = EnhancedDividends()
    try:
        history = data.get('dividends', [])
        if history:
            # Entry 0 is treated as the most recent / upcoming dividend.
            newest = history[0]
            result.next_payment = _parse_float(newest.get('dividendPayment'))
            result.next_pay_date = newest.get('dividendPayDate')
            result.next_ex_date = newest.get('dividendExDate')
            result.frequency = newest.get('dividendFrequency')
            result.annual_rate = _parse_float(newest.get('annualDividendRate'))
            result.annual_yield = _parse_float(newest.get('dividendYield'))

            # Entry 1, if any, is the dividend before that one.
            if len(history) > 1:
                prior = history[1]
                result.previous_payment = _parse_float(prior.get('dividendPayment'))
                result.previous_pay_date = prior.get('dividendPayDate')
                result.previous_ex_date = prior.get('dividendExDate')
    except Exception as e:
        logger.debug(f"Error parsing dividends API response: {e}")
    return result
def parse_earnings_api_response(data: Dict[str, Any]) -> EarningsData:
    """Build an EarningsData object from the earnings API response.

    API Response Structure:
        {
            "symbol": "GOOGL",
            "fundamentals": {},
            "upcoming": {
                "earningsDate": "10/29/2025",
                "numberOfAnalysts": 43,
                "epsNonGaapEstimate": 2.18
            },
            "historical": [
                {
                    "epsGaapActual": 2.31,
                    "epsNonGaapActual": 2.31,
                    "earningsDate": "07/23/2025",
                    "numberOfAnalysts": 43,
                    "epsNonGaapEstimate": 2.18,
                    "epsNonGaapEstimateHigh": 2.42,
                    "epsNonGaapEstimateLow": 2.0
                }
            ]
        }

    Analyst count and estimates come from "upcoming" when it is present,
    otherwise from the first "historical" entry. Errors are logged at debug
    level and a partially filled object is returned.
    """
    earnings = EarningsData()
    # (EarningsData attribute, API key) pairs shared by both estimate sources.
    estimate_fields = (
        ('consensus_estimate', 'epsNonGaapEstimate'),
        ('estimate_high', 'epsNonGaapEstimateHigh'),
        ('estimate_low', 'epsNonGaapEstimateLow'),
    )
    ratio_fields = (
        ('pe_ttm', 'peRatio'),
        ('forward_pe', 'forwardPE'),
        ('peg_ratio', 'pegRatio'),
        ('revenue_ttm', 'revenue'),
    )
    try:
        next_report = data.get('upcoming', {})
        past_reports = data.get('historical', [])
        ratios = data.get('fundamentals', {})

        # Upcoming earnings
        if next_report:
            earnings.next_announcement_date = next_report.get('earningsDate')
            earnings.announcement_timing = next_report.get('announcementTiming')
            earnings.analysts_covering = next_report.get('numberOfAnalysts')
            for attr, key in estimate_fields:
                setattr(earnings, attr, _parse_float(next_report.get(key)))

        # Historical earnings (first entry only)
        if past_reports:
            most_recent = past_reports[0]
            # Non-GAAP actual preferred, GAAP actual as fallback.
            # NOTE(review): this is one report's EPS stored in eps_ttm -
            # confirm it really is a trailing-twelve-month figure.
            earnings.eps_ttm = _parse_float(
                most_recent.get('epsNonGaapActual') or most_recent.get('epsGaapActual')
            )

            # Without upcoming data, fall back to this entry for analyst data
            if not next_report:
                earnings.analysts_covering = most_recent.get('numberOfAnalysts')
                for attr, key in estimate_fields:
                    setattr(earnings, attr, _parse_float(most_recent.get(key)))

            # Beat/miss information
            beat = most_recent.get('epsNonGaapBeat')
            if beat is not None:
                earnings.recent_beats = [{
                    'beat_amount': _parse_float(beat),
                    'beat_percent': _parse_float(most_recent.get('epsNonGaapBeatPercent')),
                    'date': most_recent.get('earningsDate'),
                }]

        # Fundamentals (PE ratios, revenue)
        if ratios:
            for attr, key in ratio_fields:
                setattr(earnings, attr, _parse_float(ratios.get(key)))
    except Exception as e:
        logger.debug(f"Error parsing earnings API response: {e}")
    return earnings
def parse_shareprofile_api_response(data: Dict[str, Any], quote: QuoteData) -> QuoteData:
    """Copy the market cap from a share profile API response onto *quote*.

    API Response Structure:
        {
            "companySummary": {
                "marketCapLabel": "Large Cap",
                "marketCapValue": "$462.11B",
                "companyEnterpriseValue": "$462.11B"
            },
            "shareInfo": [{
                "sharesOutstanding": "2.41B",
                "sharesHeld": "71.29%"
            }]
        }

    Args:
        data: Decoded share profile response.
        quote: QuoteData to update in place.

    Returns:
        The same *quote* object. ``market_cap`` is overwritten with the parsed
        'marketCapValue' (None when that key is absent). Errors are logged at
        debug level and leave *quote* as it was at the point of failure.
    """
    try:
        summary = data.get('companySummary', {})
        raw_market_cap = summary.get('marketCapValue')
        quote.market_cap = _parse_market_cap(raw_market_cap)
        # Sector information is not read here; it may live in securityprofiles
        # or other endpoints rather than in the shareprofile response.
    except Exception as e:
        logger.debug(f"Error parsing share profile API response: {e}")
    return quote
def calculate_payout_ratio(annual_dividend: Optional[float], eps_ttm: Optional[float]) -> Optional[float]:
    """Return the dividend payout ratio as a percentage, rounded to 2 places.

    Formula: (Annual Dividend Rate / EPS TTM) × 100

    Returns None when either input is missing or zero, or when EPS is not
    positive.
    """
    if not annual_dividend or not eps_ttm or not (eps_ttm > 0):
        return None
    percentage = (annual_dividend / eps_ttm) * 100
    return round(percentage, 2)
async def call_schwab_api(page: Page, url: str, debug: bool = False) -> Optional[Dict[str, Any]]:
    """Call a Schwab API endpoint from within the browser's JavaScript context.

    Runs ``fetch()`` through ``page.evaluate()`` with ``credentials: 'include'``
    and a fixed set of Schwab client headers plus two freshly generated
    correlation ids.

    NOTE(review): this module's docstring reports that these cross-origin
    fetches fail ("TypeError: Failed to fetch") because of CORS; in that case
    this function returns None.

    Args:
        page: Playwright page with authenticated session
        url: API endpoint URL
        debug: Enable debug logging

    Returns:
        Parsed JSON response, or None when the response is not OK, the fetch
        throws, or any other error occurs.
    """
    fetch_script = """
        async ({url, correlatorId, clientCorrelId}) => {
            try {
                const response = await fetch(url, {
                    method: 'GET',
                    credentials: 'include', // Include cookies
                    headers: {
                        'accept': 'application/json',
                        'accept-language': 'en-US,en;q=0.9',
                        'cache-control': 'no-cache',
                        'content-type': 'application/json',
                        'correlatorid': correlatorId,
                        'pragma': 'no-cache',
                        'schwab-client-appid': 'AD00007800',
                        'schwab-client-channel': 'IO',
                        'schwab-client-correlid': clientCorrelId,
                        'schwab-resource-version': '2',
                    }
                });
                if (!response.ok) {
                    const errorText = await response.text();
                    return {
                        success: false,
                        status: response.status,
                        error: errorText
                    };
                }
                const data = await response.json();
                return {
                    success: true,
                    status: response.status,
                    data: data
                };
            } catch (error) {
                return {
                    success: false,
                    error: error.toString()
                };
            }
        }
    """
    try:
        if debug:
            logger.debug(f"Calling API: {url}")

        # One new correlation id per header, generated for every call.
        script_args = {
            'url': url,
            'correlatorId': str(uuid.uuid4()),
            'clientCorrelId': str(uuid.uuid4()),
        }
        outcome = await page.evaluate(fetch_script, script_args)

        if outcome.get('success'):
            payload = outcome.get('data')
            if debug and payload:
                logger.debug(f"API response keys: {list(payload.keys()) if isinstance(payload, dict) else 'list'}")
            return payload

        if debug:
            status = outcome.get('status', 'unknown')
            error = outcome.get('error', 'unknown error')
            # Error bodies can be long; log only the first 200 characters.
            logger.debug(f"API returned status {status}: {str(error)[:200]}")
        return None
    except Exception as e:
        if debug:
            logger.debug(f"Error calling API {url}: {e}")
        return None
async def extract_phase1_data_api(page: Page, ticker: str, debug: bool = False) -> EquityPhase1Data:
    """Extract Phase 1 data using Schwab's REST APIs.

    Calls the quote, dividends, earnings and share profile endpoints one after
    another through ``call_schwab_api`` and parses each response. Any endpoint
    that returns nothing contributes an empty (default) object.

    Args:
        page: Playwright page with authenticated session
        ticker: Stock ticker symbol
        debug: Enable debug logging

    Returns:
        EquityPhase1Data with all extracted fields
    """
    from dataclasses import fields as dataclass_fields
    from urllib.parse import quote as url_quote

    if debug:
        logger.debug(f"Starting API-based Phase 1 extraction for {ticker}")

    base_url = "https://ausgateway.schwab.com/api/is.ResearchExperience/v1"

    # Percent-encode the ticker so symbols containing reserved characters
    # (e.g. '/', '&', spaces) cannot corrupt the query string.
    symbol = url_quote(ticker, safe='')

    # Build API URLs
    quote_url = f"{base_url}/quote?symbols={symbol}&isComplex=true"
    dividends_url = f"{base_url}/events/dividends?symbol={symbol}"
    earnings_url = f"{base_url}/events/earnings?symbols={symbol}"
    profile_url = f"{base_url}/shareprofile?symbols={symbol}&includeSubsidiaries=true"

    # Make the API calls via fetch() inside the page (see call_schwab_api)
    quote_data = await call_schwab_api(page, quote_url, debug)
    dividends_data = await call_schwab_api(page, dividends_url, debug)
    earnings_data = await call_schwab_api(page, earnings_url, debug)
    profile_data = await call_schwab_api(page, profile_url, debug)

    # Parse responses
    # The quote payload may be a list (first item used) or a single object
    if quote_data and isinstance(quote_data, list) and len(quote_data) > 0:
        quote = parse_quote_api_response(quote_data[0])
    elif quote_data and isinstance(quote_data, dict):
        quote = parse_quote_api_response(quote_data)
    else:
        quote = QuoteData()

    # Enhance quote with share profile data
    if profile_data:
        quote = parse_shareprofile_api_response(profile_data, quote)

    # Parse dividends
    dividends = parse_dividends_api_response(dividends_data) if dividends_data else EnhancedDividends()

    # Parse earnings
    earnings = parse_earnings_api_response(earnings_data) if earnings_data else EarningsData()

    # Calculate derived metrics
    calculated = CalculatedMetrics()
    if dividends.annual_rate and earnings.eps_ttm:
        calculated.payout_ratio = calculate_payout_ratio(
            dividends.annual_rate,
            earnings.eps_ttm
        )

    # Create Phase 1 data object
    phase1_data = EquityPhase1Data(
        ticker=ticker,
        quote=quote,
        dividends=dividends,
        earnings=earnings,
        calculated_metrics=calculated
    )

    if debug:
        logger.debug(f"API-based Phase 1 extraction complete for {ticker}")
        # Count populated fields (dataclasses with slots don't have __dict__).
        # Totals are taken from the dataclass definitions so they cannot drift
        # out of sync when fields are added or removed.
        quote_fields = dataclass_fields(quote)
        div_fields = dataclass_fields(dividends)
        earn_fields = dataclass_fields(earnings)
        quote_count = sum(1 for f in quote_fields if getattr(quote, f.name) is not None)
        div_count = sum(1 for f in div_fields if getattr(dividends, f.name) is not None)
        earn_count = sum(1 for f in earn_fields if getattr(earnings, f.name) not in (None, []))
        logger.debug(f" Quote fields populated: {quote_count}/{len(quote_fields)}")
        logger.debug(f" Dividend fields populated: {div_count}/{len(div_fields)}")
        logger.debug(f" Earnings fields populated: {earn_count}/{len(earn_fields)}")
    return phase1_data

View File

@@ -0,0 +1,786 @@
"""Phase 1: Essential Dividend Metrics Implementation (DEPRECATED)
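⚠️ DEPRECATED: This DOM-scraping based approach was marked as replaced by phase1_api_scraper.py,
which calls Schwab's REST APIs directly and was expected to be more reliable, complete,
and maintainable than DOM scraping.
NOTE: the API-based module's own docstring describes that approach as non-functional
(CORS restrictions) and recommends DOM scraping instead, so this deprecation notice
may be stale; confirm which module is current before acting on it.
This module was kept for reference only, with new code directed to phase1_api_scraper.py.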
Old approach extracts from DOM:
- Quote/Price Data (symbol bar)
- Enhanced Dividend Information (forward-looking dates)
- Core Earnings Metrics (EPS, forecasts)
- Basic Valuation Ratios (P/E, Forward P/E, PEG)
- Calculated Metrics (payout ratio)
"""
from typing import Dict, Any, Optional
import re
import logging
from ...core import QuoteData, EnhancedDividends, EarningsData, CalculatedMetrics, EquityPhase1Data
logger = logging.getLogger(__name__)
def _parse_float(value: Any) -> Optional[float]:
"""Safely parse a value to float, handling $ and % symbols."""
if value is None:
return None
try:
# Remove common formatting characters
clean = str(value).strip().replace('$', '').replace(',', '').replace('%', '')
if clean and clean != '--' and clean.lower() != 'n/a':
return float(clean)
except (ValueError, AttributeError):
pass
return None
def _parse_int(value: Any) -> Optional[int]:
"""Safely parse a value to int."""
if value is None:
return None
try:
clean = str(value).strip().replace(',', '')
if clean and clean != '--' and clean.lower() != 'n/a':
return int(float(clean))
except (ValueError, AttributeError):
pass
return None
def _parse_volume(volume_str: str) -> Optional[int]:
"""Parse volume string like '8M', '22.4M', '1.2B' to integer."""
if not volume_str:
return None
try:
volume_str = volume_str.strip().upper()
multiplier = 1
if volume_str.endswith('K'):
multiplier = 1_000
volume_str = volume_str[:-1]
elif volume_str.endswith('M'):
multiplier = 1_000_000
volume_str = volume_str[:-1]
elif volume_str.endswith('B'):
multiplier = 1_000_000_000
volume_str = volume_str[:-1]
value = float(volume_str)
return int(value * multiplier)
except (ValueError, AttributeError):
return None
def _parse_revenue(revenue_str: str) -> Optional[float]:
"""Parse revenue string like '$92.15B', '$1.5M' to dollar value."""
if not revenue_str:
return None
try:
revenue_str = revenue_str.strip().upper().replace('$', '').replace(',', '')
multiplier = 1
if revenue_str.endswith('K'):
multiplier = 1_000
revenue_str = revenue_str[:-1]
elif revenue_str.endswith('M'):
multiplier = 1_000_000
revenue_str = revenue_str[:-1]
elif revenue_str.endswith('B'):
multiplier = 1_000_000_000
revenue_str = revenue_str[:-1]
elif revenue_str.endswith('T'):
multiplier = 1_000_000_000_000
revenue_str = revenue_str[:-1]
value = float(revenue_str)
return value * multiplier
except (ValueError, AttributeError):
return None
async def extract_quote_data(page, ticker: str = "", debug: bool = False) -> QuoteData:
"""Extract quote/price data from symbol bar.
Args:
page: Playwright page object
ticker: Stock ticker symbol (for pattern matching)
debug: Enable debug logging
Returns:
QuoteData object with extracted fields
"""
quote = QuoteData()
try:
if debug:
logger.debug("Starting quote data extraction...")
# Wait for symbol bar content (look for key labels)
try:
await page.wait_for_selector('#app-symbol-bar-component, text=Previous close', state='attached', timeout=15000)
except Exception:
if debug:
logger.debug("Timeout waiting for symbol bar selector, attempting to parse whatever is there")
# Extract symbol bar text content (fallback to body if specific component not found)
symbol_bar_text = await page.evaluate('''
() => {
const symbolBar = document.querySelector('#app-symbol-bar-component');
if (symbolBar && symbolBar.textContent && symbolBar.textContent.includes('Previous close')) return symbolBar.textContent;
// If specific component not found, try to find the container with market data
// Look for container with "Previous close"
const labels = Array.from(document.querySelectorAll('span, div, p'));
const prevCloseLabel = labels.find(el => el.textContent && el.textContent.includes('Previous close'));
if (prevCloseLabel) {
// Return the parent's text content (go up a few levels to capture all data)
let parent = prevCloseLabel.parentElement;
let count = 0;
while (parent && count < 8) {
if (parent.textContent.length > 300) return parent.textContent;
parent = parent.parentElement;
count++;
}
}
return document.body.textContent || '';
}
''')
if debug:
logger.debug(f"Symbol bar text (first 500 chars): {symbol_bar_text[:500]}")
# Extract structured data
quote_data = await page.evaluate(r'''
(ticker) => {
const data = {};
// Helper to get text content from page
const getText = () => {
const symbolBar = document.querySelector('#app-symbol-bar-component');
// Verify it looks like the right component by checking for "Previous close"
if (symbolBar && symbolBar.textContent && symbolBar.textContent.includes('Previous close')) {
return symbolBar.textContent;
}
// Fallback logic
const labels = Array.from(document.querySelectorAll('span, div, p'));
const prevCloseLabel = labels.find(el => el.textContent && el.textContent.includes('Previous close'));
if (prevCloseLabel) {
let parent = prevCloseLabel.parentElement;
let count = 0;
while (parent && count < 8) {
if (parent.textContent.length > 300) return parent.textContent;
parent = parent.parentElement;
count++;
}
}
// Last resort: body text
return document.body.textContent || '';
};
const fullText = getText();
// Try to find price in quote container first for accuracy
const priceElement = document.querySelector('.symbol-quote-container, [data-testid="quote-price"]');
if (priceElement) {
const priceText = priceElement.textContent || '';
const priceMatch = priceText.match(/\$([0-9,]+\.[0-9]+)/);
if (priceMatch) data.price = priceMatch[1].replace(',', '');
} else {
// Fallback regex for price if element not found
// Look for price near top or just regex
const priceMatch = fullText.match(/\$([0-9,]+\.[0-9]{2})(\s|[+-]|$)/);
if (priceMatch) data.price = priceMatch[1].replace(',', '');
}
// After hours (using \s* for robustness)
const afterHoursMatch = fullText.match(/After hours:?\s*\$([0-9,.]+)/i);
if (afterHoursMatch) data.after_hours_price = afterHoursMatch[1].replace(',', '');
const afterHoursChangeMatch = fullText.match(/After hours:.*?([+-]\$[0-9,.]+)\s*\(([+-][0-9.]+)%\)/i);
if (afterHoursChangeMatch) {
data.after_hours_change = afterHoursChangeMatch[1].replace('$', '').replace(',', '');
data.after_hours_change_percent = afterHoursChangeMatch[2];
}
// Bid/Ask (using \s* for robustness)
const bidMatch = fullText.match(/Bid\s*\$([0-9,.]+)/i);
if (bidMatch) data.bid = bidMatch[1].replace(',', '');
const askMatch = fullText.match(/Ask\s*\$([0-9,.]+)/i);
if (askMatch) data.ask = askMatch[1].replace(',', '');
const bidAskSizeMatch = fullText.match(/Bid\/Ask Size\s*([0-9]+\/[0-9]+)/i);
if (bidAskSizeMatch) data.bid_ask_size = bidAskSizeMatch[1];
// Previous close and open (using \s* instead of \s+)
const prevCloseMatch = fullText.match(/Previous close\s*\$([0-9,.]+)/i);
if (prevCloseMatch) data.previous_close = prevCloseMatch[1].replace(',', '');
const openMatch = fullText.match(/Today's open\s*\$([0-9,.]+)/i);
if (openMatch) data.open = openMatch[1].replace(',', '');
// Volume (using \s*)
const volumeMatch = fullText.match(/Today's volume\s*([0-9.]+[KMB]?)/i);
if (volumeMatch) data.volume = volumeMatch[1];
const volumeVsAvgMatch = fullText.match(/Today's volume\s*[0-9.]+[KMB]?\s*(Above Avg\.|Below Avg\.|Average)/i);
if (volumeVsAvgMatch) data.volume_vs_avg = volumeVsAvgMatch[1];
// Day range
// Pattern: "Today's range low $200.81 Today's range high $203.45" or similar
// We'll look for "low $X" and "high $Y" appearing after "Today's range"
const dayRangeMatch = fullText.match(/Today's range.*?low\s*\$([0-9,.]+).*?high\s*\$([0-9,.]+)/i);
if (dayRangeMatch) {
data.day_range_low = dayRangeMatch[1].replace(',', '');
data.day_range_high = dayRangeMatch[2].replace(',', '');
}
// 52-week range
const weekRangeMatch = fullText.match(/52-week range.*?low\s*\$([0-9,.]+).*?high\s*\$([0-9,.]+)/i);
if (weekRangeMatch) {
data.week_52_low = weekRangeMatch[1].replace(',', '');
data.week_52_high = weekRangeMatch[2].replace(',', '');
}
// Market cap (may be in Share Profile section)
const marketCapMatch = fullText.match(/Market Cap\s*\$([0-9.]+[KMBT])/i);
if (marketCapMatch) data.market_cap = marketCapMatch[1];
// Change and change percent
// Try specific formatted pattern first: TICKER $PRICE CHANGE CHANGE%
// e.g. "JNJ $201.95 -1.03 -0.51%"
const standardPattern = fullText.match(/\$([0-9,.]+)\s*([+-]?[0-9,.]+)\s*([+-]?[0-9.]+)%/);
if (standardPattern) {
if (!data.price) data.price = standardPattern[1].replace(',', '');
data.change = standardPattern[2];
data.change_percent = standardPattern[3];
}
let percentMatch = null;
if (ticker && !data.change_percent) {
// Match: TICKER$digits.digits{2}percent%
const tickerPattern = new RegExp(ticker + '\\\\.?[\\s]*\\$([0-9,]+\\\\.[0-9]{2})[\\s]*([0-9.]+)%', 'i');
percentMatch = fullText.match(tickerPattern);
if (percentMatch) {
data.change_percent = percentMatch[2];
}
}
if (!data.change_percent) {
// Fallback: match any price+percent pattern with space
const fallbackMatch = fullText.match(/\$[0-9,.]+\s*([+-]?[0-9.]+)%/);
if (fallbackMatch) {
data.change_percent = fallbackMatch[1];
}
}
// Pattern 2: "+$1.23 (+0.45%)" or "-$1.23 (-0.45%)"
let changeMatch = fullText.match(/([+-]\$[0-9,.]+)\s*\(([+-][0-9.]+)%\)/);
// Pattern 3: "$193.08 +1.23 +0.64%" (price followed by change)
if (!changeMatch) {
changeMatch = fullText.match(/\$[0-9,.]+\s*([+-][0-9,.]+)\s*([+-][0-9.]+)%/);
}
// Pattern 4: "Change: +1.23 (+0.64%)"
if (!changeMatch) {
changeMatch = fullText.match(/Change:?\s*([+-][0-9,.]+)\s*\(([+-][0-9.]+)%\)/i);
}
if (changeMatch) {
data.change = changeMatch[1].replace('$', '').replace(',', '');
if (!data.change_percent) {
data.change_percent = changeMatch[2].replace(/[+]/g, '');
}
}
// Exchange - look for NYSE, NASDAQ, etc.
const exchangeMatch = fullText.match(/\b(NYSE|NASDAQ|AMEX|OTC|BATS)\b/i);
if (exchangeMatch) data.exchange = exchangeMatch[1].toUpperCase();
return data;
}
''', ticker)
# Parse and assign values
quote.price = _parse_float(quote_data.get('price'))
quote.change = _parse_float(quote_data.get('change'))
quote.change_percent = _parse_float(quote_data.get('change_percent'))
quote.after_hours_price = _parse_float(quote_data.get('after_hours_price'))
quote.after_hours_change = _parse_float(quote_data.get('after_hours_change'))
quote.after_hours_change_percent = _parse_float(quote_data.get('after_hours_change_percent'))
quote.bid = _parse_float(quote_data.get('bid'))
quote.ask = _parse_float(quote_data.get('ask'))
quote.bid_ask_size = quote_data.get('bid_ask_size')
quote.previous_close = _parse_float(quote_data.get('previous_close'))
quote.open = _parse_float(quote_data.get('open'))
quote.volume = _parse_volume(quote_data.get('volume', ''))
quote.volume_vs_avg = quote_data.get('volume_vs_avg')
quote.day_range_low = _parse_float(quote_data.get('day_range_low'))
quote.day_range_high = _parse_float(quote_data.get('day_range_high'))
quote.week_52_low = _parse_float(quote_data.get('week_52_low'))
quote.week_52_high = _parse_float(quote_data.get('week_52_high'))
quote.market_cap = quote_data.get('market_cap')
# Try to extract sector and exchange from page header
header_data = await page.evaluate(r'''
() => {
const data = {};
// Look for sector near company name
const sectorElement = document.querySelector('[data-testid="sector"], .sector');
if (sectorElement) {
data.sector = sectorElement.textContent.replace('Sector', '').trim();
} else {
// Manual search for text containing "Sector"
const spans = Array.from(document.querySelectorAll('span'));
const sectorSpan = spans.find(el => el.textContent && el.textContent.includes('Sector'));
if (sectorSpan) {
data.sector = sectorSpan.textContent.replace('Sector', '').replace(':', '').trim();
}
}
// Look for exchange near ticker
const exchangeElement = document.querySelector('[data-testid="exchange"], .exchange');
if (exchangeElement) {
data.exchange = exchangeElement.textContent.trim();
}
// Fallback: parse from page text
const pageText = document.body.textContent || '';
if (!data.sector) {
const sectorMatch = pageText.match(/Sector[:\s]+([A-Za-z\s&]+)/);
if (sectorMatch) data.sector = sectorMatch[1].trim();
}
if (!data.exchange) {
const exchangeMatch = pageText.match(/(NYSE|NASDAQ|AMEX|OTC)/i);
if (exchangeMatch) data.exchange = exchangeMatch[1].toUpperCase();
}
return data;
}
''')
quote.sector = header_data.get('sector')
quote.exchange = header_data.get('exchange')
if debug:
logger.debug(f"Extracted quote data: price={quote.price}, volume={quote.volume}, "
f"52w_range={quote.week_52_low}-{quote.week_52_high}")
except Exception as e:
if debug:
logger.debug(f"Error extracting quote data: {e}")
return quote
async def extract_enhanced_dividends(page, debug: bool = False) -> EnhancedDividends:
    """Read the ``#dividends`` panel and return its values as ``EnhancedDividends``.

    The panel is waited for (15 s timeout), scrolled into view, and its header
    is clicked before the text is read; per the note kept below, the panel
    content is not populated until it has been clicked. The panel's text is
    then matched against label regexes inside the browser. Payment amounts,
    annual rate and annual yield are converted with ``_parse_float``; dates
    and frequency are stored as the matched strings.

    Any exception raised along the way is caught (and logged only when
    ``debug`` is true); the object is returned with whatever fields had been
    assigned up to that point.

    Args:
        page: Playwright page object
        debug: Enable debug logging

    Returns:
        EnhancedDividends object with extracted fields
    """
    # Brings the panel into the viewport before interacting with it.
    scroll_script = '''
() => {
const dividendsPanel = document.querySelector('#dividends');
if (dividendsPanel) {
dividendsPanel.scrollIntoView({ behavior: 'smooth', block: 'center' });
}
}
'''
    # Non-raw Python string: "\\s" / "\\$" reach the browser as "\s" / "\$".
    extract_script = '''
() => {
const data = {};
const dividendsPanel = document.querySelector('#dividends');
if (!dividendsPanel) return data;
const fullText = dividendsPanel.textContent || '';
// DEBUG: Return sample of text for debugging
data._debug_text_sample = fullText.substring(0, 800);
// Next dividend payment
const nextPaymentMatch = fullText.match(/Next Dividend Payment\\s*\\$([0-9.]+)/i);
if (nextPaymentMatch) data.next_payment = nextPaymentMatch[1];
// Next pay date
const nextPayDateMatch = fullText.match(/Next Pay Date\\s*([A-Za-z]+ [0-9]{1,2}, [0-9]{4})/i);
if (nextPayDateMatch) data.next_pay_date = nextPayDateMatch[1];
// Next ex-date
const nextExDateMatch = fullText.match(/Next Ex-Date\\s*([A-Za-z]+ [0-9]{1,2}, [0-9]{4})/i);
if (nextExDateMatch) data.next_ex_date = nextExDateMatch[1];
// Previous dividend payment
const prevPaymentMatch = fullText.match(/Previous Dividend Payment\\s*\\$([0-9.]+)/i);
if (prevPaymentMatch) data.previous_payment = prevPaymentMatch[1];
// Previous pay date
const prevPayDateMatch = fullText.match(/Previous Pay Date\\s*([A-Za-z]+ [0-9]{1,2}, [0-9]{4})/i);
if (prevPayDateMatch) data.previous_pay_date = prevPayDateMatch[1];
// Previous ex-date
const prevExDateMatch = fullText.match(/Previous Ex-Date\\s*([A-Za-z]+ [0-9]{1,2}, [0-9]{4})/i);
if (prevExDateMatch) data.previous_ex_date = prevExDateMatch[1];
// Frequency
const frequencyMatch = fullText.match(/Frequency\\s*(Quarterly|Monthly|Annual|Semi-Annual)/i);
if (frequencyMatch) data.frequency = frequencyMatch[1];
// Annual Dividend Rate (IAD)
const annualRateMatch = fullText.match(/Annual Dividend Rate.*?\\$([0-9.]+)/i);
if (annualRateMatch) data.annual_rate = annualRateMatch[1];
// Annual Dividend Yield - appears after "Annual Dividend Yield" text
// Text pattern: "Annual Dividend Yield...2.71%"
const yieldMatch = fullText.match(/Annual Dividend Yield[\\s\\S]{0,300}?([0-9]+\\.[0-9]+)%/i);
if (yieldMatch) data.annual_yield = yieldMatch[1];
return data;
}
'''
    header_selector = '#dividends h2, #dividends .sdps-panel__title, #dividends-togglechevron-button'

    result = EnhancedDividends()
    try:
        if debug:
            logger.debug("Starting enhanced dividend extraction...")

        await page.wait_for_selector('#dividends', timeout=15000)
        await page.evaluate(scroll_script)
        await page.wait_for_timeout(1000)

        # CRITICAL: Schwab's panels don't auto-load; clicking the header
        # triggers the content load. A failed click is tolerated.
        if debug:
            logger.debug("Clicking dividends panel header to trigger content load...")
        try:
            header_handle = await page.query_selector(header_selector)
            if header_handle:
                await header_handle.click()
                await page.wait_for_timeout(2000)
                if debug:
                    logger.debug("Clicked dividends panel header successfully")
        except Exception as click_error:
            if debug:
                logger.debug(f"Could not click dividends header: {click_error}")

        # Extra settle time after the click before reading the panel text.
        await page.wait_for_timeout(1000)

        raw = await page.evaluate(extract_script)
        if debug:
            text_sample = raw.get('_debug_text_sample')
            if text_sample:
                logger.debug(f"Dividend panel text sample: {text_sample}")

        # (field, converter) pairs in assignment order; a converter of None
        # stores the matched string unchanged.
        field_plan = (
            ('next_payment', _parse_float),
            ('next_pay_date', None),
            ('next_ex_date', None),
            ('previous_payment', _parse_float),
            ('previous_pay_date', None),
            ('previous_ex_date', None),
            ('frequency', None),
            ('annual_rate', _parse_float),
            ('annual_yield', _parse_float),
        )
        for field_name, convert in field_plan:
            scraped = raw.get(field_name)
            setattr(result, field_name, scraped if convert is None else convert(scraped))

        if debug:
            logger.debug(f"Extracted dividend data: next_payment={result.next_payment}, "
                         f"next_pay_date={result.next_pay_date}, annual_rate={result.annual_rate}")
    except Exception as error:
        if debug:
            logger.debug(f"Error extracting dividend data: {error}")
    return result
async def extract_earnings_data(page, debug: bool = False) -> EarningsData:
    """Extract earnings metrics and forecasts from the ``#expected-earnings`` panel.

    The panel is waited for (15 s timeout), scrolled into view, its header is
    clicked, and a "Show More" control inside it is clicked if one is found.
    The panel text (including shadow-DOM content) is then matched against
    regexes in the browser. If that text is shorter than 500 characters or
    does not contain "Announcement", the whole document body is scanned
    instead.

    Numeric strings are converted with ``_parse_int`` / ``_parse_float`` /
    ``_parse_revenue``. ``recent_beats`` is assigned only when at least one
    "Beat ... $x" match was found, as a list of ``{'beat_amount': value}``.

    Any exception is caught (and logged only when ``debug`` is true); the
    object is returned with whatever fields had been assigned up to then.

    Args:
        page: Playwright page object
        debug: Enable debug logging

    Returns:
        EarningsData object with extracted fields
    """
    earnings = EarningsData()
    try:
        if debug:
            logger.debug("Starting earnings data extraction...")
        # Wait for earnings panel to load
        await page.wait_for_selector('#expected-earnings', timeout=15000)
        # Scroll to earnings panel
        await page.evaluate('''
() => {
const earningsPanel = document.querySelector('#expected-earnings');
if (earningsPanel) {
earningsPanel.scrollIntoView({ behavior: 'smooth', block: 'center' });
}
}
''')
        await page.wait_for_timeout(1000)
        # CRITICAL: Click on the panel header to trigger content loading
        # Schwab's panels don't auto-load - they need to be clicked
        if debug:
            logger.debug("Clicking earnings panel header to trigger content load...")
        try:
            earnings_header = await page.query_selector('#expected-earnings h2, #expected-earnings .sdps-panel__title, #expected-earnings-heading, #expected-earnings-togglechevron-button')
            if earnings_header:
                await earnings_header.click()
                await page.wait_for_timeout(2000)
                if debug:
                    logger.debug("Clicked earnings panel header successfully")
        except Exception as e:
            # Best effort: a failed click should not abort the extraction.
            if debug:
                logger.debug(f"Could not click earnings header: {e}")
        # Wait for content to load after click
        await page.wait_for_timeout(1000)
        # Check for and click "Show More" if present
        try:
            # Use JS to find and click - most robust way
            clicked = await page.evaluate('''
() => {
const panel = document.querySelector('#expected-earnings');
if (!panel) return false;
// Find any element with "Show More" text
const elements = Array.from(panel.querySelectorAll('a, button, span, div'));
const showMore = elements.find(el => el.textContent.trim().toLowerCase() === "show more");
if (showMore) {
showMore.click();
return true;
}
return false;
}
''')
            if clicked:
                if debug:
                    logger.debug("found and clicked 'Show More' via JS")
                await page.wait_for_timeout(2000)
            elif debug:
                logger.debug("'Show More' not found or not clickable")
        except Exception as e:
            if debug:
                logger.debug(f"Error checking for Show More: {e}")
        # Extract earnings data.
        # NOTE: this is a *raw* Python string, so every regex escape below is
        # written with a single backslash. A literal dollar sign is "\$";
        # writing "\\$" would reach the browser as "backslash + end-of-input"
        # and the pattern could never match.
        earnings_data = await page.evaluate(r'''
(debug) => {
const data = {};
// Helper to get text content including Shadow DOMs
const getDeepText = (root) => {
if (!root) return '';
if (root.nodeType === Node.TEXT_NODE) return root.textContent;
if (root.nodeType === Node.ELEMENT_NODE && root.shadowRoot) {
return getDeepText(root.shadowRoot);
}
let text = '';
const children = root.childNodes;
for (let i = 0; i < children.length; i++) {
text += getDeepText(children[i]);
}
return text;
};
const earningsPanel = document.querySelector('#expected-earnings');
let fullText = '';
if (earningsPanel) {
fullText = getDeepText(earningsPanel);
}
// Fallback to body deep text if panel seems empty
if (fullText.length < 500 || !fullText.includes("Announcement")) {
fullText = getDeepText(document.body);
}
// Next earnings announcement - robust regex checking for various patterns
let nextAnnouncementMatch = fullText.match(/Next Earnings Announcement.*?([0-9]{2}\/[0-9]{2}\/[0-9]{4})/i);
if (!nextAnnouncementMatch) {
// Try alternate pattern: Announcement: 12/12/2025
nextAnnouncementMatch = fullText.match(/Announcement:?\s*([0-9]{2}\/[0-9]{2}\/[0-9]{4})/i);
}
if (nextAnnouncementMatch) data.next_announcement_date = nextAnnouncementMatch[1];
// Announcement timing
const timingMatch = fullText.match(/(Before Market Open|After Market Close)/i);
if (timingMatch) data.announcement_timing = timingMatch[1];
// Number of analysts
const analystsMatch = fullText.match(/With ([0-9]+) analysts covering/i);
if (analystsMatch) data.analysts_covering = analystsMatch[1];
// Consensus estimate
const consensusMatch = fullText.match(/consensus.*?estimate is \$([0-9.]+)/i);
if (consensusMatch) data.consensus_estimate = consensusMatch[1];
// High/Low estimates
const highLowMatch = fullText.match(/high and low estimates are \$([0-9.]+) and \$([0-9.]+)/i);
if (highLowMatch) {
data.estimate_high = highLowMatch[1];
data.estimate_low = highLowMatch[2];
}
// EPS TTM (multiple patterns)
let epsMatch = fullText.match(/EPS\s*\(TTM\)\s*(?:Value)?\s*\$?([0-9.-]+)/i);
if (!epsMatch) epsMatch = fullText.match(/Earnings per Share\s*\(?TTM\)?\s*(?:Value)?\s*\$?([0-9.-]+)/i);
if (!epsMatch) epsMatch = fullText.match(/EPS\s+(?:Value)?\s*([0-9.-]+)/i);
if (epsMatch) data.eps_ttm = epsMatch[1];
// Revenue TTM
let revenueMatch = fullText.match(/Revenue\s*\(TTM\)\s*(?:Value)?\s*\$([0-9.]+[KMBT]?)/i);
if (!revenueMatch) revenueMatch = fullText.match(/Revenue\s+(?:Value)?\s*\$([0-9.]+[KMBT])/i);
if (revenueMatch) data.revenue_ttm = revenueMatch[1];
// P/E TTM (multiple patterns)
let peMatch = fullText.match(/Price[\/\s]*Earnings\s*\(TTM\)\s*(?:Value)?\s*([0-9.]+)/i);
if (!peMatch) peMatch = fullText.match(/P[\/\s]*E\s*\(?TTM\)?\s*(?:Value)?\s*([0-9.]+)/i);
if (!peMatch) peMatch = fullText.match(/PE Ratio\s*\(TTM\)\s*(?:Value)?\s*([0-9.]+)/i);
if (peMatch) data.pe_ttm = peMatch[1];
// Forward P/E
let forwardPeMatch = fullText.match(/Forward\s+P[\/\s]*E\s*(?:Value)?\s*([0-9.]+)/i);
if (!forwardPeMatch) forwardPeMatch = fullText.match(/P[\/\s]*E\s*\(Forward\)\s*(?:Value)?\s*([0-9.]+)/i);
if (forwardPeMatch) data.forward_pe = forwardPeMatch[1];
// PEG Ratio
let pegMatch = fullText.match(/Price\s+to\s+Earnings[\/\s]*Growth\s*\(PEG\)\s*(?:Value)?\s*([0-9.]+)/i);
if (!pegMatch) pegMatch = fullText.match(/PEG\s*Ratio?\s*(?:Value)?\s*([0-9.]+)/i);
if (pegMatch) data.peg_ratio = pegMatch[1];
// Recent beats/misses (simplified - just extract beat amounts)
const beatMatches = fullText.matchAll(/Beat.*?\$([0-9.]+)/gi);
data.recent_beats = [];
for (const match of beatMatches) {
data.recent_beats.push(match[1]);
}
return data;
}
''', debug)
        # Parse and assign values
        earnings.next_announcement_date = earnings_data.get('next_announcement_date')
        earnings.announcement_timing = earnings_data.get('announcement_timing')
        earnings.analysts_covering = _parse_int(earnings_data.get('analysts_covering'))
        earnings.consensus_estimate = _parse_float(earnings_data.get('consensus_estimate'))
        earnings.estimate_high = _parse_float(earnings_data.get('estimate_high'))
        earnings.estimate_low = _parse_float(earnings_data.get('estimate_low'))
        earnings.eps_ttm = _parse_float(earnings_data.get('eps_ttm'))
        earnings.revenue_ttm = _parse_revenue(earnings_data.get('revenue_ttm', ''))
        earnings.pe_ttm = _parse_float(earnings_data.get('pe_ttm'))
        earnings.forward_pe = _parse_float(earnings_data.get('forward_pe'))
        earnings.peg_ratio = _parse_float(earnings_data.get('peg_ratio'))
        # Store recent beats as list of dicts
        if earnings_data.get('recent_beats'):
            earnings.recent_beats = [
                {'beat_amount': _parse_float(beat)}
                for beat in earnings_data.get('recent_beats', [])
            ]
        if debug:
            logger.debug(f"Extracted earnings data: eps_ttm={earnings.eps_ttm}, "
                         f"pe_ttm={earnings.pe_ttm}, forward_pe={earnings.forward_pe}")
    except Exception as e:
        # Best effort: return whatever was collected before the failure.
        if debug:
            logger.debug(f"Error extracting earnings data: {e}")
    return earnings
def calculate_payout_ratio(annual_dividend: Optional[float], eps_ttm: Optional[float]) -> Optional[float]:
    """Return the dividend payout ratio as a percentage.

    Formula: (Annual Dividend Rate / EPS TTM) × 100, rounded to two decimals.

    Args:
        annual_dividend: Annual dividend rate per share
        eps_ttm: Earnings per share (trailing twelve months)

    Returns:
        Payout ratio as percentage, or None when either input is missing or
        zero, or when ``eps_ttm`` is not greater than zero.
    """
    # Missing or zero inputs: nothing to compute.
    if not annual_dividend or not eps_ttm:
        return None
    # Only a strictly positive EPS yields a ratio.
    if not (eps_ttm > 0):
        return None
    percentage = (annual_dividend / eps_ttm) * 100
    return round(percentage, 2)
async def extract_phase1_data(page, debug: bool = False) -> EquityPhase1Data:
    """Run every Phase 1 extractor against the page and bundle the results.

    After a fixed 3-second pause, the ticker is taken from the page URL (the
    letters following ``stocks/``, upper-cased; empty string when the URL does
    not match). Quote, dividend and earnings sections are then extracted in
    that order, and the payout ratio is derived when both the annual dividend
    rate and the TTM EPS are truthy.

    Args:
        page: Playwright page object
        debug: Enable debug output

    Returns:
        EquityPhase1Data object with all extracted data
    """
    # Non-raw Python string: "\\/" reaches the browser as "\/".
    ticker_script = '''
() => {
const url = window.location.href;
const match = url.match(/stocks\\/([A-Z]+)/i);
return match ? match[1].toUpperCase() : '';
}
'''
    if debug:
        logger.debug("Starting Phase 1 data extraction...")

    # Give the page time to stabilize before reading anything from it.
    await page.wait_for_timeout(3000)
    ticker = await page.evaluate(ticker_script)

    # Sections are extracted sequentially, in this order.
    quote_section = await extract_quote_data(page, ticker=ticker, debug=debug)
    dividend_section = await extract_enhanced_dividends(page, debug=debug)
    earnings_section = await extract_earnings_data(page, debug=debug)

    # Derived metrics: only attempted when both inputs are present and non-zero.
    derived = CalculatedMetrics()
    if dividend_section.annual_rate and earnings_section.eps_ttm:
        derived.payout_ratio = calculate_payout_ratio(
            dividend_section.annual_rate,
            earnings_section.eps_ttm,
        )

    bundle = EquityPhase1Data(
        ticker=ticker,
        quote=quote_section,
        dividends=dividend_section,
        earnings=earnings_section,
        calculated_metrics=derived,
    )
    if debug:
        logger.debug(f"Phase 1 extraction complete for {ticker}")
    return bundle

View File

@@ -0,0 +1,977 @@
from typing import Dict, Any, Optional
from ...utils.logging import save_debug_artifact
def should_replace_dividend_value(existing_value: Optional[str], new_value: Optional[str]) -> bool:
    """
    Tell whether a newly scraped dividend value should overwrite the stored one.

    Rules, checked in order:
    - An empty, None, or whitespace-only new value never replaces anything
    - Any usable new value fills a missing/empty existing value
    - A usable new value replaces an existing value containing "Show More"
    - In every other case the existing (good) data is kept
    """
    new_is_usable = bool(new_value) and bool(str(new_value).strip())
    if not new_is_usable:
        return False
    if not existing_value:
        return True
    # The substring test also covers the exact-match case.
    return 'Show More' in str(existing_value)
async def extract_dividend_data(page, debug: bool = False) -> Dict[str, Any]:
"""
Extract dividend information from Schwab stock page.
Returns dictionary with dividend data fields.
"""
dividend_data: Dict[str, Any] = {}
try:
if debug:
print("DEBUG: Starting dividend data extraction...")
# Take initial screenshot to see page state
png = await page.screenshot(full_page=True)
path = save_debug_artifact("debug_dividend_start.png", png)
print(f"DEBUG: Initial screenshot saved as {path}")
# Wait for the dividends section to load dynamically
if debug:
print("DEBUG: Waiting for dividends section to load...")
try:
# First wait for the dividends panel to appear
await page.wait_for_selector('#dividends', timeout=15000)
if debug:
print("DEBUG: #dividends panel found")
# Wait for dividend content to load dynamically
dividend_loaded = False
max_attempts = 5 # Reduced from 10 for faster tests
attempt = 0
while not dividend_loaded and attempt < max_attempts:
attempt += 1
if debug:
print(f"DEBUG: Attempt {attempt}/{max_attempts} - Waiting for dynamic dividend content...")
# Check if the dividends section has been populated with actual content
dividend_status = await page.evaluate('''
() => {
const result = { loaded: false, debug: {} };
// Look for the dividends panel content that should be populated
const dividendsPanel = document.querySelector('#dividends');
if (dividendsPanel) {
const panelBody = dividendsPanel.querySelector('.sdps-panel__body');
if (panelBody) {
const textContent = panelBody.textContent || '';
result.debug.panelBodyLength = textContent.length;
result.debug.panelBodySample = textContent.substring(0, 200);
// Check if the panel has been populated with actual dividend text
// (not just empty comments)
const hasRealContent = textContent.length > 50 && (
textContent.includes('Previous Dividend') ||
textContent.includes('Pay Date') ||
textContent.includes('Ex-Date') ||
textContent.includes('Frequency') ||
textContent.includes('Annual Dividend') ||
textContent.includes('$') ||
textContent.includes('%')
);
if (hasRealContent) {
result.loaded = true;
return result;
}
}
}
// Alternative: check for stock-dividends component
const stockDividends = document.querySelector('stock-dividends');
if (stockDividends) {
const text = stockDividends.textContent || '';
result.debug.stockDividendsLength = text.length;
result.debug.stockDividendsSample = text.substring(0, 100);
if (text.length > 20 && text.includes('$')) {
result.loaded = true;
return result;
}
}
// Alternative: check for any elements with dividend-related content
const allElements = document.querySelectorAll('#dividends *');
result.debug.totalElements = allElements.length;
for (let elem of allElements) {
const text = elem.textContent || '';
if (text.includes('Previous Dividend Payment') ||
(text.includes('$') && text.includes('.'))) {
result.loaded = true;
result.debug.foundInElement = elem.tagName + '.' + elem.className;
return result;
}
}
return result;
}
''')
if debug:
print(f"DEBUG: Dividend status: {dividend_status}")
dividend_loaded = dividend_status.get('loaded', False)
if dividend_loaded:
if debug:
print("DEBUG: Dynamic dividend content loaded!")
png = await page.screenshot(full_page=True)
path = save_debug_artifact("debug_dividend_content_loaded.png", png)
print(f"DEBUG: Screenshot after content loaded: {path}")
break
# Wait between attempts to allow for async loading
await page.wait_for_timeout(1000) # Reduced from 2000ms for faster tests
if not dividend_loaded:
if debug:
print("DEBUG: Basic dividend content did not auto-load - this suggests the page is not behaving as expected")
print("DEBUG: Expected behavior: Basic dividend info should be visible without clicking 'Show More'")
# Try to force a page refresh or trigger loading
print("DEBUG: Attempting to trigger dividend content loading...")
try:
# Try scrolling to the dividend section to trigger lazy loading
await page.evaluate('''
() => {
const dividendsPanel = document.querySelector('#dividends');
if (dividendsPanel) {
dividendsPanel.scrollIntoView({ behavior: 'smooth', block: 'center' });
}
}
''')
await page.wait_for_timeout(3000)
# Try clicking on the dividends panel header to ensure it's active
try:
dividends_header = await page.query_selector('#dividends h2, #dividends .sdps-panel__title')
if dividends_header:
await dividends_header.click()
await page.wait_for_timeout(2000)
print("DEBUG: Clicked on dividends panel header")
except:
pass
# Check one more time if content loaded
final_status = await page.evaluate('''
() => {
const dividendsPanel = document.querySelector('#dividends');
if (dividendsPanel) {
const panelBody = dividendsPanel.querySelector('.sdps-panel__body');
if (panelBody) {
const textContent = panelBody.textContent || '';
return {
length: textContent.length,
sample: textContent.substring(0, 500),
hasBasicData: textContent.includes('$') && (
textContent.includes('Previous') ||
textContent.includes('Pay Date') ||
textContent.includes('Ex-Date')
)
};
}
}
return { length: 0, sample: '', hasBasicData: false };
}
''')
if debug:
print(f"DEBUG: Final dividend panel status: {final_status}")
if final_status.get('hasBasicData'):
print("DEBUG: Basic dividend data now detected after manual triggering!")
dividend_loaded = True
# Extract the data immediately while it's loaded
immediate_extraction = await page.evaluate(r'''
() => {
const results = {};
const dividendsPanel = document.querySelector('#dividends');
if (dividendsPanel) {
const panelBody = dividendsPanel.querySelector('.sdps-panel__body');
if (panelBody) {
const fullText = panelBody.textContent || '';
// Extract data using pattern matching from the full text
const patterns = {
'Previous Dividend Payment': /Previous Dividend Payment\s*\$([0-9]+\.[0-9]+)/,
'Previous Pay Date': /Previous Pay Date\s*([A-Za-z]+ [0-9]{1,2}, [0-9]{4})/,
'Previous Ex-Date': /Previous Ex-Date\s*([A-Za-z]+ [0-9]{1,2}, [0-9]{4})/,
'Frequency': /Frequency\s*([A-Za-z]+)/,
'Annual Dividend Rate': /(?:Annual Dividend Rate|IAD).*?\$([0-9]+\.[0-9]+)/,
'Annual Dividend Yield': /([0-9]+\.[0-9]+%)(?=\s|Annual|$)/
};
for (const [field, pattern] of Object.entries(patterns)) {
const match = fullText.match(pattern);
if (match) {
if (field === 'Previous Dividend Payment' || field === 'Annual Dividend Rate') {
results[field] = '$' + match[1];
} else {
results[field] = match[1];
}
}
}
}
}
return results;
}
''')
if debug:
print(f"DEBUG: Immediate extraction results: {immediate_extraction}")
if immediate_extraction:
dividend_data.update(immediate_extraction)
# Clean up the Frequency field if it has extra text
if 'Frequency' in dividend_data and 'Quarterly' in dividend_data['Frequency']:
dividend_data['Frequency'] = 'Quarterly'
except Exception as e:
if debug:
print(f"DEBUG: Error during manual triggering: {e}")
png = await page.screenshot(full_page=True)
path = save_debug_artifact("debug_dividend_timeout.png", png)
print(f"DEBUG: Screenshot after timeout: {path}")
except Exception as e:
if debug:
print(f"DEBUG: Error waiting for dividend content: {e}")
# Check for dividend grid directly without clicking
if debug:
print("DEBUG: Checking for #dividend-grid...")
dividend_grid_found = False
try:
await page.wait_for_selector('#dividend-grid', timeout=10000)
dividend_grid_found = True
if debug:
print("DEBUG: #dividend-grid found!")
png = await page.screenshot(full_page=True)
path = save_debug_artifact("debug_dividend_grid_found.png", png)
print(f"DEBUG: Screenshot with dividend grid: {path}")
except:
if debug:
print("DEBUG: #dividend-grid not found initially")
png = await page.screenshot(full_page=True)
path = save_debug_artifact("debug_dividend_no_grid.png", png)
print(f"DEBUG: Screenshot without grid: {path}")
# Try to scroll to the dividend section to ensure it's in view
if debug:
print("DEBUG: Scrolling to stock-dividends component...")
try:
await page.evaluate('''
() => {
const stockDividends = document.querySelector('stock-dividends');
if (stockDividends) {
stockDividends.scrollIntoView({ behavior: 'smooth', block: 'center' });
}
}
''')
await page.wait_for_timeout(3000)
if debug:
png = await page.screenshot(full_page=True)
path = save_debug_artifact("debug_dividend_after_scroll.png", png)
print(f"DEBUG: Screenshot after scroll: {path}")
# Check again for dividend grid after scrolling
try:
await page.wait_for_selector('#dividend-grid', timeout=5000)
dividend_grid_found = True
if debug:
print("DEBUG: #dividend-grid found after scroll!")
png = await page.screenshot(full_page=True)
path = save_debug_artifact("debug_dividend_grid_after_scroll.png", png)
print(f"DEBUG: Screenshot with grid after scroll: {path}")
except:
if debug:
print("DEBUG: #dividend-grid still not found after scroll")
except Exception as e:
if debug:
print(f"DEBUG: Error during scroll attempt: {e}")
# Common dividend section selectors used by financial websites
dividend_selectors = [
'#dividend-grid', # Primary target based on user feedback
'stock-dividends', # Secondary target - the web component
'#dividend-section',
'#dividends-section',
'.dividend-summary',
'.dividends-summary',
'div[data-testid*="dividend"]',
'div[aria-label*="dividend"]',
'[class*="dividend"]',
'section:has-text("Dividend")',
'div:has-text("Previous Dividend Payment")'
]
# Try to find dividend section
dividend_section = None
for selector in dividend_selectors:
try:
if await page.is_visible(selector):
dividend_section = selector
if debug:
print(f"DEBUG: Found dividend section with selector: {selector}")
break
except:
continue
if not dividend_section:
if debug:
print("DEBUG: No dividend section found, trying broader search...")
# In debug mode, capture the page content to help identify selectors
page_content = await page.content()
path_html = save_debug_artifact("debug_dividend_page.html", page_content)
print(f"DEBUG: Page HTML saved to {path_html} for analysis")
# Also save a screenshot to see the visual layout
png = await page.screenshot(full_page=True)
path_png = save_debug_artifact("debug_dividend_page.png", png)
print(f"DEBUG: Page screenshot saved to {path_png}")
# Fallback: look for dividend-related text anywhere on page
dividend_text_exists = await page.evaluate('''
() => {
const text = document.body.innerText.toLowerCase();
return text.includes('dividend') || text.includes('ex-date') || text.includes('pay date') || text.includes('previous dividend') || text.includes('iad');
}
''')
if debug:
print(f"DEBUG: Dividend-related text found on page: {dividend_text_exists}")
# Try scrolling down to reveal more content
await page.evaluate('window.scrollTo(0, document.body.scrollHeight)')
await page.wait_for_timeout(2000)
# Extract all text content that might contain dividend info
dividend_related_text = await page.evaluate('''
() => {
const text = document.body.innerText;
const lines = text.split('\n');
const dividendLines = lines.filter(line => {
const lower = line.toLowerCase();
return lower.includes('dividend') || lower.includes('ex-date') ||
lower.includes('pay date') || lower.includes('previous') ||
lower.includes('iad') || lower.includes('frequency') ||
lower.includes('quarterly') || lower.includes('$0.26') ||
lower.includes('0.4865%') || lower.includes('$1.04') ||
lower.includes('annual dividend') || lower.includes('yield');
});
return dividendLines;
}
''')
print(f"DEBUG: Found dividend-related text lines: {dividend_related_text}")
# Try a more comprehensive search for dividend data
all_dividend_info = await page.evaluate('''
() => {
// Look for elements containing common dividend field names
const fieldNames = [
'Previous Dividend Payment', 'Next Dividend Payment',
'Previous Pay Date', 'Next Pay Date',
'Previous Ex-Date', 'Next Ex-Date', 'Ex-Date',
'Frequency', 'Annual Dividend Rate', 'IAD',
'Annual Dividend Yield', 'Dividend Yield'
];
const results = {};
fieldNames.forEach(fieldName => {
// Search for elements containing this field name
const elements = Array.from(document.querySelectorAll('*')).filter(el =>
el.textContent && el.textContent.includes(fieldName) &&
el.children.length === 0 // Text nodes only
);
elements.forEach(el => {
// Look for value in nearby elements
const parent = el.parentElement;
if (parent) {
const siblings = Array.from(parent.children);
const currentIndex = siblings.indexOf(el);
// Check next siblings for values
for (let i = currentIndex + 1; i < siblings.length; i++) {
const sibling = siblings[i];
const text = sibling.textContent.trim();
if (text && text !== fieldName && text.length > 0 && text.length < 50) {
results[fieldName] = text;
break;
}
}
// Check same element for values after the field name
const fullText = el.textContent;
const fieldIndex = fullText.indexOf(fieldName);
if (fieldIndex >= 0) {
const afterField = fullText.substring(fieldIndex + fieldName.length).trim();
if (afterField && afterField.length > 0 && afterField.length < 50) {
results[fieldName] = afterField;
}
}
}
});
});
return results;
}
''')
print(f"DEBUG: Comprehensive dividend search results: {all_dividend_info}")
# If we found data in the comprehensive search, use it only if we don't already have good data
if all_dividend_info:
for field, value in all_dividend_info.items():
if value and value.strip():
existing_value = dividend_data.get(field, '')
if should_replace_dividend_value(existing_value, value):
dividend_data[field] = value.strip()
if debug:
print(f"DEBUG: Added dividend field from comprehensive search: {field} = {value}")
elif debug:
print(f"DEBUG: Keeping existing good data for {field}: {existing_value} (ignoring comprehensive search value: {value})")
if not dividend_text_exists:
if debug:
print("DEBUG: No dividend-related content found on page")
return dividend_data
# Use body as fallback section for broad search
dividend_section = 'body'
if debug:
print("DEBUG: Using body as dividend section for broad search")
# If we found the dividend grid, use specific selectors based on user feedback
if dividend_section == '#dividend-grid':
if debug:
print("DEBUG: Using specific dividend grid selectors...")
try:
# First check if dividend grid is actually present and populated
grid_status = await page.evaluate('''
() => {
const dividendGrid = document.querySelector('#dividend-grid');
if (!dividendGrid) return { found: false, message: 'No #dividend-grid element found' };
const textContent = dividendGrid.textContent || '';
const hasContent = textContent.trim().length > 50;
const childCount = dividendGrid.children.length;
return {
found: true,
hasContent,
textLength: textContent.length,
childCount,
preview: textContent.substring(0, 200),
message: `Grid found with ${childCount} children, ${textContent.length} chars`
};
}
''')
if debug:
print(f"DEBUG: Dividend grid status: {grid_status}")
# Extract dividend data using improved selectors
specific_dividend_data = await page.evaluate(r'''
() => {
const results = {};
// Check if dividend grid exists and has content
const dividendGrid = document.querySelector('#dividend-grid');
if (dividendGrid) {
const allGridText = dividendGrid.textContent || '';
const lines = allGridText.split('\n').map(line => line.trim()).filter(line => line.length > 0);
// Try structured approach first - look for rows/cells
const dividendRows = dividendGrid.querySelectorAll('div[class*="row"], tr, .dividend-row, div:has(div)');
dividendRows.forEach((row, rowIndex) => {
const rowText = row.textContent || '';
// Look for dividend payment info
if (rowText.includes('Dividend Payment') || (rowText.includes('Previous') && rowText.includes('$'))) {
const amountMatch = rowText.match(/\$[0-9]+\.[0-9]+/);
if (amountMatch && !results['Previous Dividend Payment']) {
results['Previous Dividend Payment'] = amountMatch[0];
}
// Look for dates in the same row
const dateMatches = rowText.match(/([A-Za-z]+ [0-9]{1,2}, [0-9]{4})/g);
if (dateMatches) {
if (dateMatches.length >= 1 && !results['Previous Pay Date']) results['Previous Pay Date'] = dateMatches[0];
if (dateMatches.length >= 2 && !results['Previous Ex-Date']) results['Previous Ex-Date'] = dateMatches[1];
}
}
});
// Fallback: Parse all lines systematically
for (let i = 0; i < lines.length; i++) {
const line = lines[i];
const nextLine = i + 1 < lines.length ? lines[i + 1] : '';
// Match dividend payment
if ((line.includes('Previous Dividend Payment') || line.includes('Dividend Payment')) && !results['Previous Dividend Payment']) {
const amountPattern = /\$[0-9]+\.[0-9]+/;
let amount = line.match(amountPattern) || nextLine.match(amountPattern);
if (amount) results['Previous Dividend Payment'] = amount[0];
}
// Match pay date
if (line.includes('Pay Date') && !results['Previous Pay Date']) {
const datePattern = /[A-Za-z]{3,9} [0-9]{1,2}, [0-9]{4}/;
let date = line.match(datePattern) || nextLine.match(datePattern);
if (date) results['Previous Pay Date'] = date[0];
}
// Match ex-date
if (line.includes('Ex-Date') && !results['Previous Ex-Date']) {
const datePattern = /[A-Za-z]{3,9} [0-9]{1,2}, [0-9]{4}/;
let date = line.match(datePattern) || nextLine.match(datePattern);
if (date) results['Previous Ex-Date'] = date[0];
}
// Match frequency
if (line.includes('Frequency') && !results['Frequency']) {
const freqLine = line + ' ' + nextLine;
if (freqLine.toLowerCase().includes('quarterly')) results['Frequency'] = 'Quarterly';
else if (freqLine.toLowerCase().includes('monthly')) results['Frequency'] = 'Monthly';
else if (freqLine.toLowerCase().includes('annual')) results['Frequency'] = 'Annual';
else if (freqLine.toLowerCase().includes('semi')) results['Frequency'] = 'Semi-Annual';
}
// Match annual dividend rate
if ((line.includes('Annual Dividend Rate') || line.includes('IAD')) && !results['Annual Dividend Rate']) {
const amountPattern = /\$[0-9]+\.[0-9]+/;
let amount = line.match(amountPattern) || nextLine.match(amountPattern);
if (amount) results['Annual Dividend Rate'] = amount[0];
}
// Match annual dividend yield
if (line.includes('Annual Dividend Yield') && !results['Annual Dividend Yield']) {
const percentPattern = /[0-9]+\.[0-9]+%/;
let percent = line.match(percentPattern) || nextLine.match(percentPattern);
if (percent) results['Annual Dividend Yield'] = percent[0];
}
}
}
return results;
}
''')
if debug:
print(f"DEBUG: Specific dividend grid extraction results: {specific_dividend_data}")
# Add the extracted data to dividend_data only if we don't already have good data
if specific_dividend_data:
for field, value in specific_dividend_data.items():
existing_value = dividend_data.get(field, '')
if should_replace_dividend_value(existing_value, value):
dividend_data[field] = value
if debug:
print(f"DEBUG: Updated {field} from specific extraction: {value}")
elif debug:
print(f"DEBUG: Keeping existing good data for {field}: {existing_value} (ignoring specific extraction value: {value})")
except Exception as e:
if debug:
print(f"DEBUG: Error in specific dividend grid extraction: {e}")
# Extract dividend data using the correct structure from gemini analysis
if debug:
print("DEBUG: Extracting dividend data from dividend-grid structure...")
# First try to extract data from the dynamically loaded dividend content
try:
dividend_dynamic_data = await page.evaluate(r'''
() => {
const results = {};
// Strategy 1: Look for any dividend grid structure that was loaded
const dividendGrid = document.querySelector('#dividend-grid');
if (dividendGrid) {
const rows = dividendGrid.querySelectorAll('div.sdps-row, .row');
for (let row of rows) {
const cells = row.querySelectorAll('div[class*="col-"]');
if (cells.length >= 2) {
const label = cells[0].textContent.trim();
const value = cells[1].textContent.trim();
// Map the labels to our expected field names
if (label.includes('Previous Dividend Payment') || label.includes('Dividend Payment')) {
results['Previous Dividend Payment'] = value;
} else if (label.includes('Previous Pay Date') || label.includes('Pay Date')) {
results['Previous Pay Date'] = value;
} else if (label.includes('Previous Ex-Date') || label.includes('Ex-Date')) {
results['Previous Ex-Date'] = value;
} else if (label.includes('Frequency')) {
results['Frequency'] = value;
} else if (label.includes('Annual Dividend Rate') || label.includes('IAD')) {
results['Annual Dividend Rate'] = value;
} else if (label.includes('Annual Dividend Yield')) {
results['Annual Dividend Yield'] = value;
}
}
}
if (Object.keys(results).length > 0) {
return results;
}
}
// Strategy 2: Look for stock-dividends component content
const stockDividends = document.querySelector('stock-dividends');
if (stockDividends) {
const allText = stockDividends.textContent || '';
const lines = allText.split('\n').map(line => line.trim()).filter(line => line);
for (let i = 0; i < lines.length; i++) {
const line = lines[i];
const nextLine = i + 1 < lines.length ? lines[i + 1] : '';
if (line.includes('Previous Dividend Payment') || line.includes('Dividend Payment')) {
const amountMatch = (line + ' ' + nextLine).match(/\$[0-9]+\.[0-9]+/);
if (amountMatch) results['Previous Dividend Payment'] = amountMatch[0];
} else if (line.includes('Pay Date')) {
const dateMatch = (line + ' ' + nextLine).match(/[A-Za-z]+ [0-9]{1,2}, [0-9]{4}/);
if (dateMatch) results['Previous Pay Date'] = dateMatch[0];
} else if (line.includes('Ex-Date')) {
const dateMatch = (line + ' ' + nextLine).match(/[A-Za-z]+ [0-9]{1,2}, [0-9]{4}/);
if (dateMatch) results['Previous Ex-Date'] = dateMatch[0];
} else if (line.includes('Frequency')) {
if (line.toLowerCase().includes('quarterly') || nextLine.toLowerCase().includes('quarterly')) {
results['Frequency'] = 'Quarterly';
} else if (line.toLowerCase().includes('monthly') || nextLine.toLowerCase().includes('monthly')) {
results['Frequency'] = 'Monthly';
} else if (line.toLowerCase().includes('annual') || nextLine.toLowerCase().includes('annual')) {
results['Frequency'] = 'Annual';
}
} else if (line.includes('Annual Dividend Rate') || line.includes('IAD')) {
const amountMatch = (line + ' ' + nextLine).match(/\$[0-9]+\.[0-9]+/);
if (amountMatch) results['Annual Dividend Rate'] = amountMatch[0];
} else if (line.includes('Annual Dividend Yield')) {
const percentMatch = (line + ' ' + nextLine).match(/[0-9]+\.[0-9]+%/);
if (percentMatch) results['Annual Dividend Yield'] = percentMatch[0];
}
}
if (Object.keys(results).length > 0) {
return results;
}
}
// Strategy 3: Look within entire dividends panel for any structured content
const dividendsPanel = document.querySelector('#dividends');
if (dividendsPanel) {
const allElements = dividendsPanel.querySelectorAll('*');
for (let elem of allElements) {
const text = elem.textContent || '';
// Look for dollar amounts near dividend-related text
if (text.includes('Previous Dividend Payment') || text.includes('Dividend Payment')) {
const parent = elem.parentElement;
if (parent) {
const siblings = Array.from(parent.children);
const currentIndex = siblings.indexOf(elem);
// Check next siblings for values
for (let j = currentIndex + 1; j < siblings.length; j++) {
const sibling = siblings[j];
const siblingText = sibling.textContent.trim();
const amountMatch = siblingText.match(/\$[0-9]+\.[0-9]+/);
if (amountMatch) {
results['Previous Dividend Payment'] = amountMatch[0];
break;
}
}
}
}
// Similar logic for other fields...
// (truncated for brevity but would include Pay Date, Ex-Date, etc.)
}
}
return results;
}
''')
if debug:
print(f"DEBUG: Dynamic dividend extraction results: {dividend_dynamic_data}")
if dividend_dynamic_data:
for field, value in dividend_dynamic_data.items():
existing_value = dividend_data.get(field, '')
if should_replace_dividend_value(existing_value, value):
dividend_data[field] = value
if debug:
print(f"DEBUG: Updated {field} from dynamic extraction: {value}")
elif debug:
print(f"DEBUG: Keeping existing good data for {field}: {existing_value} (ignoring dynamic extraction value: {value})")
except Exception as e:
if debug:
print(f"DEBUG: Error in dynamic dividend extraction: {e}")
# Define dividend fields and their possible selectors as fallback
dividend_fields = {
'Previous Dividend Payment': [
'#dividend-grid div:has-text("Previous Dividend Payment") ~ div',
'#dividend-grid div:has-text("Dividend Payment") ~ div',
'#dividends span:has-text("Previous Dividend Payment") + span',
'#dividends div:has-text("Previous Dividend Payment") + div',
'#dividends *:has-text("Previous Dividend Payment") ~ *',
'stock-dividends span:has-text("Previous Dividend Payment") + span',
'stock-dividends div:has-text("Previous Dividend Payment") + div',
'span:has-text("Previous Dividend Payment") + span',
'div:has-text("Previous Dividend Payment") + div',
'*:has-text("Previous Dividend Payment") ~ *',
'span:has-text("Next Dividend Payment") + span',
'div:has-text("Next Dividend Payment") + div',
'*:has-text("Next Dividend Payment") ~ *',
'[data-field="dividend-payment"]',
'.dividend-payment'
],
'Previous Pay Date': [
'#dividend-grid div:has-text("Previous Pay Date") ~ div',
'#dividend-grid div:has-text("Pay Date") ~ div',
'#dividends span:has-text("Previous Pay Date") + span',
'#dividends div:has-text("Previous Pay Date") + div',
'#dividends *:has-text("Previous Pay Date") ~ *',
'stock-dividends span:has-text("Previous Pay Date") + span',
'stock-dividends div:has-text("Previous Pay Date") + div',
'span:has-text("Previous Pay Date") + span',
'div:has-text("Previous Pay Date") + div',
'*:has-text("Previous Pay Date") ~ *',
'span:has-text("Next Pay Date") + span',
'div:has-text("Next Pay Date") + div',
'*:has-text("Next Pay Date") ~ *',
'*:has-text("Pay Date") ~ *',
'[data-field="pay-date"]',
'.pay-date'
],
'Previous Ex-Date': [
'#dividend-grid div:has-text("Previous Ex-Date") ~ div',
'#dividend-grid div:has-text("Ex-Date") ~ div',
'#dividends span:has-text("Previous Ex-Date") + span',
'#dividends div:has-text("Previous Ex-Date") + div',
'#dividends *:has-text("Previous Ex-Date") ~ *',
'stock-dividends span:has-text("Previous Ex-Date") + span',
'stock-dividends div:has-text("Previous Ex-Date") + div',
'span:has-text("Previous Ex-Date") + span',
'div:has-text("Previous Ex-Date") + div',
'*:has-text("Previous Ex-Date") ~ *',
'span:has-text("Next Ex-Date") + span',
'div:has-text("Next Ex-Date") + div',
'*:has-text("Next Ex-Date") ~ *',
'*:has-text("Ex-Date") ~ *',
'[data-field="ex-date"]',
'.ex-date'
],
'Frequency': [
'#dividend-grid div:has-text("Frequency") ~ div',
'#dividends span:has-text("Frequency") + span',
'#dividends div:has-text("Frequency") + div',
'#dividends *:has-text("Frequency") ~ *',
'stock-dividends span:has-text("Frequency") + span',
'stock-dividends div:has-text("Frequency") + div',
'span:has-text("Frequency") + span',
'div:has-text("Frequency") + div',
'*:has-text("Frequency") ~ *',
'[data-field="frequency"]',
'.dividend-frequency',
'.frequency'
],
'Annual Dividend Rate': [
'#dividend-grid div:has-text("Annual Dividend Rate") ~ div',
'#dividend-grid div:has-text("IAD") ~ div',
'#dividends span:has-text("Annual Dividend Rate") + span',
'#dividends div:has-text("Annual Dividend Rate") + div',
'#dividends *:has-text("Annual Dividend Rate") ~ *',
'#dividends span:has-text("IAD") + span',
'#dividends *:has-text("IAD") ~ *',
'stock-dividends span:has-text("Annual Dividend Rate") + span',
'stock-dividends div:has-text("Annual Dividend Rate") + div',
'stock-dividends span:has-text("IAD") + span',
'span:has-text("Annual Dividend Rate") + span',
'div:has-text("Annual Dividend Rate") + div',
'*:has-text("Annual Dividend Rate") ~ *',
'span:has-text("IAD") + span',
'*:has-text("IAD") ~ *',
'[data-field="annual-rate"]',
'.annual-dividend-rate'
],
'Annual Dividend Yield': [
'#dividend-grid div:has-text("Annual Dividend Yield") ~ div',
'#dividends span:has-text("Annual Dividend Yield") + span',
'#dividends div:has-text("Annual Dividend Yield") + div',
'#dividends *:has-text("Annual Dividend Yield") ~ *',
'stock-dividends span:has-text("Annual Dividend Yield") + span',
'stock-dividends div:has-text("Annual Dividend Yield") + div',
'span:has-text("Annual Dividend Yield") + span',
'div:has-text("Annual Dividend Yield") + div',
'*:has-text("Annual Dividend Yield") ~ *',
'[data-field="dividend-yield"]',
'.dividend-yield'
]
}
# Extract each dividend field using multiple selector strategies
for field_name, selectors in dividend_fields.items():
field_found = False
# Try each selector for this field
for selector in selectors:
if field_found:
break
try:
# Scope search within dividend section if found, otherwise search whole page
full_selector = f'{dividend_section} {selector}' if dividend_section != 'body' else selector
if await page.is_visible(full_selector, timeout=1000):
value = await page.inner_text(full_selector)
clean_value = value.strip()
if clean_value and clean_value != field_name: # Ensure we got actual value, not the label
existing_value = dividend_data.get(field_name, '')
if should_replace_dividend_value(existing_value, clean_value):
dividend_data[field_name] = clean_value
field_found = True
if debug:
print(f"DEBUG: Found {field_name}: {clean_value} (selector: {full_selector})")
elif debug:
print(f"DEBUG: Keeping existing good data for {field_name}: {existing_value} (ignoring selector-based value: {clean_value})")
break
except:
continue
# If standard selectors failed, try JavaScript-based text search as fallback
if not field_found:
try:
# Try multiple variations of the field name
search_terms = [field_name]
if "Previous" in field_name:
search_terms.append(field_name.replace("Previous", "Next"))
if "Annual Dividend Rate" in field_name:
search_terms.append("IAD")
if "Annual Dividend Yield" in field_name:
search_terms.append("Dividend Yield")
for search_term in search_terms:
if field_found:
break
value = await page.evaluate(rf'''
() => {{
const searchText = "{search_term}";
// First check within the dividends section specifically
const dividendsPanel = document.querySelector('#dividends');
const stockDividends = document.querySelector('stock-dividends');
const searchContainers = [dividendsPanel, stockDividends, document];
for (let container of searchContainers) {{
if (!container) continue;
const elements = Array.from(container.querySelectorAll('*'));
for (let elem of elements) {{
if (elem.textContent && elem.textContent.includes(searchText)) {{
// Look for next sibling or nearby element with value
let candidate = elem.nextElementSibling;
if (candidate && candidate.textContent &&
!candidate.textContent.includes(searchText) &&
candidate.textContent.trim().length > 0) {{
return candidate.textContent.trim();
}}
// Try parent's next sibling
candidate = elem.parentElement?.nextElementSibling;
if (candidate && candidate.textContent &&
!candidate.textContent.includes(searchText) &&
candidate.textContent.trim().length > 0) {{
return candidate.textContent.trim();
}}
// Try looking in the same element's parent for nearby text
const parent = elem.parentElement;
if (parent) {{
const parentText = parent.textContent;
const lines = parentText.split('\n');
for (let i = 0; i < lines.length; i++) {{
if (lines[i].includes(searchText) && i + 1 < lines.length) {{
const nextLine = lines[i + 1].trim();
if (nextLine && !nextLine.includes(searchText)) {{
return nextLine;
}}
}}
}}
}}
}}
}}
// If found in this container, stop searching
if (container !== document) {{
break;
}}
}}
return null;
}}
''')
if value and value.strip():
existing_value = dividend_data.get(field_name, '')
if should_replace_dividend_value(existing_value, value):
dividend_data[field_name] = value.strip()
field_found = True
if debug:
print(f"DEBUG: Found {field_name} via JS search with term '{search_term}': {value}")
elif debug:
print(f"DEBUG: Keeping existing good data for {field_name}: {existing_value} (ignoring JS search value: {value})")
break
except Exception as e:
if debug:
print(f"DEBUG: Could not find {field_name}: {e}")
continue
if debug:
print(f"DEBUG: Extracted dividend data: {dividend_data}")
return dividend_data
except Exception as e:
if debug:
print(f"DEBUG: Error extracting dividend data: {e}")
return dividend_data
async def extract(page, debug: bool = False) -> Dict[str, Any]:
    """Backward-compatible alias that delegates to `extract_dividend_data`.

    Args:
        page: Page object forwarded unchanged to `extract_dividend_data`.
        debug: Forwarded as the `debug` keyword argument.

    Returns:
        Whatever `extract_dividend_data` returns for the same arguments.
    """
    dividend_fields = await extract_dividend_data(page, debug=debug)
    return dividend_fields

View File

@@ -0,0 +1,452 @@
import time
from typing import Any, Dict, Optional
import logging
from ...core.config import load_config, get_playwright_url
from ...browser.auth import ensure_cookies
from ...browser.client import connect, new_context, new_page
from ...browser.navigation import goto_with_auth_check
from ...core import Envelope, ErrorType, MorningstarData, EquityPhase1Data, fail, ok
from .morningstar import find_report, download_report_as_bytes
from ...storage.cache import ensure_cache_dir, cache_filename, read_cached_pdf, write_cached_pdf
from .parser import parse as parse_pdf
from .scraper import extract_dividend_data
from .phase1_scraper import extract_phase1_data # DOM scraping - the working approach
import re
def extract_company_name_from_title(page_title: str, ticker: str) -> Optional[str]:
    """Derive a company name from a Schwab research page title.

    Two strategies are tried in order:

    1. A name immediately followed by the ticker in parentheses at the start of
       the title, e.g. ``"Apple Inc (AAPL) ..."`` -> ``"Apple Inc"``.
    2. The text before the first ``" |"`` or ``" -"`` separator, provided it is
       longer than one character and is not just the ticker itself.

    Args:
        page_title: Raw page title. Falsy values yield ``None``.
        ticker: Ticker symbol; compared case-insensitively.

    Returns:
        The extracted company name, or ``None`` when neither strategy produces a
        usable name or when the inputs are not strings.
    """
    if not page_title:
        return None
    try:
        symbol = ticker.upper()

        # Strip only unambiguous site boilerplate before the ticker match. The
        # bare word "Research" is deliberately NOT removed here: doing so would
        # corrupt legitimate names such as "Lam Research Corp".
        title = page_title
        for phrase in (
            " | Charles Schwab",
            " - Charles Schwab",
            "Stock Quote & Summary",
            "Stock Research",
        ):
            title = title.replace(phrase, "")

        # Strategy 1: "<name> (TICKER)" anchored at the start of the title.
        match = re.match(rf"^(.+?)\s*\({re.escape(symbol)}\)", title, re.IGNORECASE)
        if match:
            company_name = match.group(1).strip().replace(" -", "").strip()
            if len(company_name) > 1 and not company_name.isdigit():
                return company_name

        # Strategy 2: separator-based fallback. Here a leftover "Research" label
        # is dropped so that e.g. "AAPL - Research" does not yield a bogus name.
        title = title.replace("Research", "")
        for separator in (" |", " -"):
            if separator in title:
                potential_name = title.split(separator)[0].strip()
                if potential_name.upper() != symbol and len(potential_name) > 1:
                    return potential_name
        return None
    except (AttributeError, TypeError):
        # Non-string page_title/ticker (e.g. None ticker): treat as "no name".
        return None
async def get_equity_phase1_data(ticker: str, debug: bool = False) -> Envelope[EquityPhase1Data]:
    """Fetch the Phase 1 equity data set for one ticker from Schwab's research page.

    The flow is: obtain session cookies, connect to the browser, open the stock
    research page with an auth-aware navigation helper, confirm that
    stock-specific content rendered, then hand the page to
    `extract_phase1_data` (DOM scraping).

    Args:
        ticker: Stock ticker symbol; upper-cased before use.
        debug: When true, raises this module's logger to DEBUG, emits extra
            debug messages and uses the shorter navigation timeout.

    Returns:
        `ok(...)` wrapping the result of `extract_phase1_data`, or `fail(...)`
        with `ErrorType.AUTHENTICATION` (no cookies / navigation rejected) or
        `ErrorType.VALIDATION` (stock content never appeared or could not be
        checked).
    """
    ticker = ticker.upper()
    logger = logging.getLogger(__name__)
    if debug:
        logger.setLevel(logging.DEBUG)
        logger.debug(f"Starting get_equity_phase1_data for {ticker}")

    # --- Session -----------------------------------------------------------
    session_cookies = await ensure_cookies()
    if not session_cookies:
        return fail(
            "Unable to establish a session. Provide credentials in config.json or a valid cookies.json.",
            ErrorType.AUTHENTICATION,
            retryable=False,
        )

    playwright_url = get_playwright_url(load_config())

    # Either the company-name span or the Morningstar section signals that the
    # page rendered stock-specific content.
    stock_content_selector = (
        'span.sdps-title-3.sc-sdps-solo-layout:not(.sdps-font-bold), #morningstar-section'
    )
    validation_script = '''
        () => {
            const nameSpan = document.querySelector('span.sdps-title-3.sc-sdps-solo-layout:not(.sdps-font-bold)');
            if (nameSpan && nameSpan.textContent && nameSpan.textContent.trim().length > 2) {
                return true;
            }
            const morningstarSection = document.querySelector('#morningstar-section');
            if (morningstarSection) {
                return true;
            }
            return false;
        }
    '''

    # --- Browser -----------------------------------------------------------
    context = None
    page = None
    p, browser = await connect(playwright_url)
    try:
        context = await new_context(browser, cookies=session_cookies)
        page = await new_page(context)

        research_url = f"https://client.schwab.com/app/research/#/stocks/{ticker}"
        nav_timeout_ms = 30000 if debug else 45000
        navigated = await goto_with_auth_check(
            page,
            context,
            research_url,
            debug=debug,
            timeout=nav_timeout_ms,
        )
        if not navigated:
            return fail(
                "Authentication failed while navigating to research page",
                ErrorType.AUTHENTICATION,
                retryable=True,
            )

        if debug:
            logger.debug(f"Current page URL: {page.url}")

        # Step 1 of validation: wait for stock-specific content to show up.
        try:
            await page.wait_for_selector(
                stock_content_selector,
                timeout=10000,
                state='visible'
            )
        except Exception as wait_err:
            if debug:
                logger.debug(f"Timeout waiting for stock content: {wait_err}")
            return fail(
                f"Invalid ticker: {ticker}. This appears not to be a valid stock ticker.",
                ErrorType.VALIDATION,
                retryable=False,
            )

        # Step 2 of validation: make sure what rendered is non-trivial.
        try:
            looks_like_stock_page = await page.evaluate(validation_script)
            if not looks_like_stock_page:
                return fail(
                    f"Invalid ticker: {ticker}. This appears not to be a valid stock ticker.",
                    ErrorType.VALIDATION,
                    retryable=False,
                )
        except Exception as e:
            logger.debug(f"Error checking for valid content: {e}")
            return fail(
                f"Invalid ticker: {ticker}. Unable to validate ticker.",
                ErrorType.VALIDATION,
                retryable=False,
            )

        # DOM scraping is used here; per the original note, the API approach
        # failed due to CORS restrictions.
        return ok(await extract_phase1_data(page, debug=debug))
    finally:
        # Best-effort teardown: page, context, browser, then the driver handle.
        # A failure in one step must not stop the remaining steps.
        for resource, closer in (
            (page, "close"),
            (context, "close"),
            (browser, "close"),
            (p, "stop"),
        ):
            try:
                if resource is not None:
                    await getattr(resource, closer)()
            except Exception:
                pass
async def get_morningstar_data(ticker: str, debug: bool = False) -> Envelope[MorningstarData]:
    """Collect Morningstar report data and dividend fields for one ticker.

    Steps, in order:

    1. Ensure the PDF cache directory exists and obtain session cookies.
    2. Open the Schwab stock research page with the auth-aware navigation helper.
    3. Validate that stock-specific content rendered (otherwise the ticker is
       treated as invalid).
    4. Read the company name from the page, scrape dividend fields, and locate
       the Morningstar Equity Report link/date.
    5. Download the report PDF, cache it and parse it; when no PDF could be
       downloaded, fall back to a previously cached PDF.

    Args:
        ticker: Stock ticker symbol; upper-cased before use.
        debug: When true, raises this module's logger to DEBUG, emits extra
            debug messages and uses the shorter navigation timeout.

    Returns:
        `ok(MorningstarData)` on success. `fail(...)` with
        `ErrorType.AUTHENTICATION` when no session could be established or
        navigation was rejected, `ErrorType.VALIDATION` when stock content never
        appeared or could not be checked, and `ErrorType.PARSING` when the report
        (live or cached) could not be parsed or no report was available at all.
    """
    ticker = ticker.upper()
    ensure_cache_dir()
    logger = logging.getLogger(__name__)
    if debug:
        logger.setLevel(logging.DEBUG)
        logger.debug(f"Starting get_morningstar_data for {ticker}")

    # Session management
    cookies = await ensure_cookies()
    if not cookies:
        return fail(
            "Unable to establish a session. Provide credentials in config.json or a valid cookies.json.",
            ErrorType.AUTHENTICATION,
            retryable=False,
        )
    config = load_config()
    playwright_url = get_playwright_url(config)

    # Browser orchestration
    context = None
    page = None
    p, browser = await connect(playwright_url)
    try:
        context = await new_context(browser, cookies=cookies)
        page = await new_page(context)

        # Use the shared auth-aware navigation helper for consistency.
        # The shorter timeout applies when debug is enabled.
        timeout = 30000 if debug else 45000
        success = await goto_with_auth_check(
            page,
            context,
            f"https://client.schwab.com/app/research/#/stocks/{ticker}",
            debug=debug,
            timeout=timeout,
        )
        if not success:
            return fail(
                "Authentication failed while navigating to research page",
                ErrorType.AUTHENTICATION,
                retryable=True,
            )

        # Schwab doesn't redirect on invalid tickers; instead the page content
        # stays empty/invalid, so validity is inferred from what renders.
        if debug:
            logger.debug(f"Current page URL: {page.url}")

        # The research page loads asynchronously: wait for either the company
        # name or the Morningstar section before validating.
        try:
            await page.wait_for_selector(
                'span.sdps-title-3.sc-sdps-solo-layout:not(.sdps-font-bold), #morningstar-section',
                timeout=10000,
                state='visible'
            )
        except Exception as wait_err:
            # Neither selector appeared within 10 seconds: likely an invalid ticker.
            if debug:
                logger.debug(f"Timeout waiting for stock content: {wait_err}")
            return fail(
                f"Invalid ticker: {ticker}. This appears not to be a valid stock ticker.",
                ErrorType.VALIDATION,
                retryable=False,
            )

        # Additional validation: check that the page carries real stock content.
        try:
            has_valid_content = await page.evaluate('''
                () => {
                    // Look for company name span (valid stock pages have this)
                    const nameSpan = document.querySelector('span.sdps-title-3.sc-sdps-solo-layout:not(.sdps-font-bold)');
                    if (nameSpan && nameSpan.textContent && nameSpan.textContent.trim().length > 2) {
                        return true;
                    }
                    // Look for Morningstar section (valid stock pages have this)
                    const morningstarSection = document.querySelector('#morningstar-section');
                    if (morningstarSection) {
                        return true;
                    }
                    // Look for company profile description (valid stock pages have this)
                    const profileText = document.querySelector('p.sdps-text-body.sc-sdps-solo-layout');
                    if (profileText && profileText.textContent && profileText.textContent.trim().length > 50) {
                        return true;
                    }
                    // Look for any stock-related content
                    const stockContent = document.querySelector('#stock-details, #quote, [data-testid="stock-quote"]');
                    if (stockContent) {
                        return true;
                    }
                    return false;
                }
            ''')
            if debug:
                logger.debug(f"Valid stock content detected: {has_valid_content}")
            if not has_valid_content:
                if debug:
                    logger.debug("Invalid ticker detected - no stock content found")
                return fail(
                    f"Invalid ticker: {ticker}. This appears not to be a valid stock ticker.",
                    ErrorType.VALIDATION,
                    retryable=False,
                )
        except Exception as e:
            logger.debug(f"Error checking for valid content: {e}")
            # If the check itself fails, assume invalid and return an error.
            return fail(
                f"Invalid ticker: {ticker}. Unable to validate ticker.",
                ErrorType.VALIDATION,
                retryable=False,
            )

        # Company name - extracted from page elements; failure is non-fatal.
        company_name = None
        try:
            company_name = await page.evaluate('''
                () => {
                    // Look for company name in title span
                    const nameSpan = document.querySelector('span.sdps-title-3.sc-sdps-solo-layout:not(.sdps-font-bold)');
                    if (nameSpan && nameSpan.textContent && nameSpan.textContent.trim().length > 2) {
                        return nameSpan.textContent.trim();
                    }
                    // Fallback: Extract from company profile description
                    const profileText = document.querySelector('p.sdps-text-body.sc-sdps-solo-layout');
                    if (profileText && profileText.textContent) {
                        const text = profileText.textContent.trim();
                        // Extract company name before " designs" or " is" or " provides"
                        const match = text.match(/^([A-Za-z0-9\\s&\\.,'-]+?)(?:\\s+(?:designs|is|provides|manufactures|operates|offers|engages))/i);
                        if (match) {
                            return match[1].trim();
                        }
                    }
                    return null;
                }
            ''')
            if debug and company_name:
                logger.debug(f"Extracted company name: {company_name}")
        except Exception as e:
            logger.debug(f"Company name extraction error: {e}")

        # Morningstar section wait; a timeout is logged but not fatal.
        try:
            await page.wait_for_selector('#morningstar-section', timeout=30000)
        except Exception:
            logger.debug("#morningstar-section not found within timeout")

        # Dividends; any scraper failure degrades to "no dividend data".
        try:
            dividend_data = await extract_dividend_data(page, debug=debug)
        except Exception as exc:
            logger.debug(f"Dividend extraction error: {exc}")
            dividend_data = {}

        # Find report and download/cache
        report_url, report_date = await find_report(page, debug=debug)
        data: Dict[str, Any] = {}
        if report_date:
            data["Morningstar Equity Report Date"] = report_date.strip()
        if report_url:
            # Only store an actual URL, not the __CLICK_TO_OPEN__ marker.
            if report_url != '__CLICK_TO_OPEN__':
                data["Morningstar Equity Report URL"] = report_url
            # NOTE(review): nesting reconstructed from a flattened listing; the
            # download is attempted for the click-to-open marker as well, since
            # the marker means the link must be clicked to obtain the PDF - confirm.
            pdf_bytes = await download_report_as_bytes(page, report_url, debug=debug)
        else:
            pdf_bytes = None

        parsed_data: Dict[str, Any] = {}
        if pdf_bytes:
            # Build the cache date: report date when available, else today.
            if report_date:
                from datetime import datetime
                try:
                    dt = datetime.strptime(report_date, "%b %d, %Y")
                    formatted_date = dt.strftime("%m-%d-%Y")
                except ValueError:
                    # Unrecognised date format: fall back to a filename-safe-ish form.
                    formatted_date = report_date.replace(" ", "-")
            else:
                formatted_date = time.strftime("%m-%d-%Y")
            write_cached_pdf(ticker, formatted_date, pdf_bytes)
            try:
                parsed_data = parse_pdf(pdf_bytes)
                parsed_data["source"] = "live"
            except Exception as exc:
                logger.debug(f"PDF parsing failed: {exc}")
                parsed_data = {"error": "Failed to parse Morningstar report"}
        else:
            # No live PDF: try the most recent cached copy for this ticker.
            cached = read_cached_pdf(ticker)
            if cached:
                try:
                    parsed_data = parse_pdf(cached)
                    parsed_data["source"] = "cache"
                except Exception as exc:
                    logger.debug(f"Cached PDF parsing failed: {exc}")
                    parsed_data = {"error": "Failed to parse cached Morningstar report"}
            else:
                parsed_data = {"error": f"Failed to download and no cache available for {ticker}"}

        # The dividend scraper stores the ex-date under "Previous Ex-Date"; the
        # older "Previous Ex-Dividend Date" key is kept as a fallback so the
        # field is no longer silently dropped.
        previous_ex_date = (
            dividend_data.get("Previous Ex-Date")
            or dividend_data.get("Previous Ex-Dividend Date")
        )

        morningstar = MorningstarData(
            ticker=ticker,
            company_name=company_name,
            previous_dividend_payment=dividend_data.get("Previous Dividend Payment"),
            previous_pay_date=dividend_data.get("Previous Pay Date"),
            previous_ex_date=previous_ex_date,
            frequency=dividend_data.get("Frequency"),
            annual_dividend_rate=dividend_data.get("Annual Dividend Rate"),
            annual_dividend_yield=dividend_data.get("Annual Dividend Yield"),
            fair_value=parsed_data.get("Fair Value"),
            economic_moat=parsed_data.get("Economic Moat"),
            capital_allocation=parsed_data.get("Capital Allocation"),
            rating=_safe_int(parsed_data.get("Morningstar Rating")),
            one_star_price=parsed_data.get("1-Star Price"),
            five_star_price=parsed_data.get("5-Star Price"),
            assessment=parsed_data.get("Assessment"),
            range_52_week=parsed_data.get("52-Week Range"),
            dividend_yield=parsed_data.get("Dividend Yield"),
            investment_style=parsed_data.get("Investment Style"),
            report_url=data.get("Morningstar Equity Report URL"),
            report_date=data.get("Morningstar Equity Report Date"),
            source=parsed_data.get("source"),
        )
        if parsed_data.get("error"):
            return fail(parsed_data["error"], ErrorType.PARSING, retryable=True)
        return ok(morningstar)
    finally:
        # Best-effort teardown: page, context, browser, then the driver handle.
        # Each step is isolated so one failed close cannot leak the others.
        for resource, closer in (
            (page, "close"),
            (context, "close"),
            (browser, "close"),
            (p, "stop"),
        ):
            if resource is None:
                continue
            try:
                await getattr(resource, closer)()
            except Exception as cleanup_err:
                logger.debug(f"Cleanup step '{closer}' failed: {cleanup_err}")
def _safe_int(value: Any) -> Optional[int]:
if value is None:
return None
try:
return int(str(value).strip())
except (TypeError, ValueError):
return None