Files
schwab-mcp-custom/schwab_scraper/features/equity/morningstar.py
b3nw 650ea2d087
All checks were successful
Build and Push Docker Image / build (push) Successful in 34s
Fix build: Bundle schwab_scraper source and use local dependencies
2026-04-24 01:50:20 +00:00

240 lines
9.3 KiB
Python

from typing import Optional, Tuple
import logging
async def _first_date_text(page, selectors) -> Optional[str]:
    """Return the stripped text of the first selector that yields non-empty text.

    Selectors that match nothing or raise are skipped. If only empty text was
    found, the last (empty) value is returned; if nothing matched, ``None``.
    """
    found = None
    for selector in selectors:
        try:
            candidates = page.locator(selector)
            if await candidates.count() > 0:
                found = (await candidates.first.inner_text()).strip()
                if found:
                    break
        except Exception:
            # Best-effort lookup: a broken selector must not abort the search.
            # ``Exception`` (not a bare except) so task cancellation propagates.
            continue
    return found


async def find_report(page, debug: bool = False) -> Tuple[Optional[str], Optional[str]]:
    """Locate the Morningstar Equity Report link and date on the stock page.

    Three strategies are tried in order, so the lookup keeps working when the
    page markup changes:

    1. The original, exact selector for the report link.
    2. A list of looser fallback CSS selectors.
    3. A JavaScript scan of every ``a[href]`` on the page.

    Args:
        page: The page object to search (must provide ``is_visible``,
            ``locator``, ``evaluate`` and ``screenshot``).
        debug: When true, log each step at DEBUG level and, if nothing is
            found, save a full-page screenshot plus a summary of page state.

    Returns:
        Tuple of ``(url, date)`` where:

        - url: The ``href`` attribute if it is a traditional link, or the
          special marker ``'__CLICK_TO_OPEN__'`` if the link found by the
          original selector has no ``href`` and must be clicked to open.
        - date: The report date string if found.

        ``(None, None)`` is returned when no strategy finds a usable link.
    """
    logger = logging.getLogger(__name__)
    primary_date_selector = (
        r"#Morningstar\ Equity\ Report > span:nth-child(3) > sdps-date-time > time > span:nth-child(2)"
    )

    # Strategy 1: Original selector
    report_link_selector = "div[id='Morningstar Equity Report'] a.sr-report-link"
    if await page.is_visible(report_link_selector):
        if debug:
            logger.debug("Found Morningstar report using original selector")
        report_link_element = page.locator(report_link_selector)
        await report_link_element.scroll_into_view_if_needed()
        url = await report_link_element.get_attribute("href")

        # Date element (escaped spaces in the id)
        date_locator = page.locator(primary_date_selector)
        if await date_locator.count() > 0:
            date_text = (await date_locator.inner_text()).strip()
        else:
            date_text = None

        # An empty/missing href means the link only produces a URL on click.
        if not url:
            if debug:
                logger.debug("Link found but href is empty - this is a modern web component that generates blob URLs on click")
            # Special marker: the caller must click the link to get the URL.
            return '__CLICK_TO_OPEN__', date_text
        return url, date_text

    # Strategy 2: Look for any link containing "morningstar" in research section
    if debug:
        logger.debug("Original selector failed, trying fallback selectors...")
    fallback_selectors = [
        "a.sr-report-link[href*='morningstar']",
        "a[href*='morningstar'][href*='pdf']",
        "#morningstar-section a.sr-report-link",
        "div[id*='Morningstar'] a",
    ]
    date_selectors = [
        primary_date_selector,
        "sdps-date-time time span",
        "time span",
    ]
    for selector in fallback_selectors:
        try:
            if not await page.is_visible(selector, timeout=2000):
                continue
            if debug:
                logger.debug("Found Morningstar report using fallback selector: %s", selector)
            report_link_element = page.locator(selector).first
            await report_link_element.scroll_into_view_if_needed()
            url = await report_link_element.get_attribute("href")
            if not url:
                # A link without an href is unusable here; keep searching
                # instead of returning an empty URL to the caller.
                if debug:
                    logger.debug("Fallback selector %s matched a link without href, skipping", selector)
                continue
            date_text = await _first_date_text(page, date_selectors)
            return url, date_text
        except Exception as e:
            if debug:
                logger.debug("Fallback selector %s failed: %s", selector, e)
            continue

    # Strategy 3: Use JavaScript to search for Morningstar links
    if debug:
        logger.debug("All CSS selectors failed, trying JavaScript search...")
    try:
        result = await page.evaluate("""
            () => {
                // Look for any link containing 'morningstar' and 'pdf'
                const links = Array.from(document.querySelectorAll('a[href]'));
                const morningstarLink = links.find(link =>
                    link.href.toLowerCase().includes('morningstar') &&
                    link.href.toLowerCase().includes('pdf')
                );
                if (morningstarLink) {
                    // Try to find associated date
                    let dateText = null;
                    const parent = morningstarLink.closest('[id*="Morningstar"]') || morningstarLink.parentElement;
                    if (parent) {
                        const timeElement = parent.querySelector('time');
                        if (timeElement) {
                            dateText = timeElement.textContent.trim();
                        }
                    }
                    return {
                        url: morningstarLink.href,
                        date: dateText
                    };
                }
                return null;
            }
        """)
        if result and result.get('url'):
            if debug:
                logger.debug("Found Morningstar report using JavaScript search: %s", result['url'])
            return result['url'], result.get('date')
    except Exception as e:
        if debug:
            logger.debug("JavaScript search failed: %s", e)

    # No report found
    if debug:
        logger.debug("No Morningstar report link found using any strategy")
        # Capture page state for debugging
        try:
            await page.screenshot(path="debug_morningstar_not_found.png", full_page=True)
            logger.debug("Saved debug screenshot to: debug_morningstar_not_found.png")
            # Log available elements for debugging
            page_info = await page.evaluate("""
                () => {
                    return {
                        hasMorningstarSection: !!document.querySelector('#morningstar-section'),
                        hasMorningstarDiv: !!document.querySelector('div[id*="Morningstar"]'),
                        morningstarLinks: Array.from(document.querySelectorAll('a[href]'))
                            .filter(a => a.href.toLowerCase().includes('morningstar'))
                            .length,
                        allReportLinks: Array.from(document.querySelectorAll('a.sr-report-link')).length
                    }
                }
            """)
            logger.debug("Page state: %s", page_info)
        except Exception as e:
            logger.debug("Failed to capture debug info: %s", e)
    return None, None
async def download_report_as_bytes(page, url: str, debug: bool = False) -> Optional[bytes]:
    """Open the PDF in a new page and return bytes via data URL conversion.

    The report is opened in a new browser page, fetched from inside that page
    and Base64-encoded there, then decoded to bytes here. The new page is
    always closed before returning.

    Args:
        page: The current Playwright page.
        url: Either a traditional URL or the '__CLICK_TO_OPEN__' marker, in
            which case the report link on ``page`` is clicked and the URL of
            the page it opens is used.
        debug: Enable debug logging.

    Returns:
        PDF bytes if successful, None otherwise (empty ``url``, the new page
        failed to open or load, or the bytes could not be extracted).
    """
    import base64

    logger = logging.getLogger(__name__)
    if not url:
        return None

    new_page = None
    try:
        if url == '__CLICK_TO_OPEN__':
            # Handle blob URL case (modern web component)
            if debug:
                logger.debug("Handling blob URL - clicking link to open PDF")
            report_link_selector = "div[id='Morningstar Equity Report'] a.sr-report-link"
            try:
                # Start listening for the new page BEFORE clicking, so the
                # "page" event cannot fire before anyone is waiting for it.
                async with page.context.expect_page(timeout=15000) as page_info:
                    await page.click(report_link_selector)
                new_page = await page_info.value
                if debug:
                    logger.debug("New page opened with URL: %s", new_page.url)
                # Wait for PDF to load
                await new_page.wait_for_load_state('load', timeout=10000)
                # The opened page's own URL is what gets fetched below.
                fetch_url = new_page.url
            except Exception as e:
                if debug:
                    logger.debug("Error clicking link to open PDF: %s", e)
                return None
        else:
            # Traditional URL case
            if debug:
                logger.debug("Opening PDF from traditional URL: %s", url)
            try:
                async with page.context.expect_page() as page_info:
                    await page.evaluate("url => window.open(url, '_blank')", url)
                new_page = await page_info.value
                await new_page.wait_for_load_state('load')
            except Exception as e:
                if debug:
                    logger.debug("Error opening PDF from traditional URL: %s", e)
                return None
            fetch_url = url

        # Fetch and convert to Base64 in browser context
        try:
            pdf_base64 = await new_page.evaluate(
                """
                async (url) => {
                    const response = await fetch(url);
                    const blob = await response.blob();
                    return await new Promise((resolve) => {
                        const reader = new FileReader();
                        reader.onloadend = () => resolve(reader.result.split(',')[1]);
                        reader.readAsDataURL(blob);
                    });
                }
                """,
                fetch_url,
            )
            if not pdf_base64:
                return None
            return base64.b64decode(pdf_base64)
        except Exception as e:
            if debug:
                logger.debug("Error extracting PDF bytes: %s", e)
            return None
    finally:
        # Close the opened page on every exit path so it never leaks.
        if new_page is not None:
            try:
                await new_page.close()
            except Exception as e:
                # Closing is best-effort; the result has already been decided.
                if debug:
                    logger.debug("Failed to close PDF page: %s", e)