import base64
import logging
from typing import Optional, Tuple

logger = logging.getLogger(__name__)

# Marker returned by find_report() in place of a URL when the report link has
# no usable href and must be clicked to open the PDF (blob URL).
CLICK_TO_OPEN_MARKER = "__CLICK_TO_OPEN__"

# Primary selectors for the report link and its date (spaces in the id are
# escaped for the CSS id selector).
_REPORT_LINK_SELECTOR = "div[id='Morningstar Equity Report'] a.sr-report-link"
_PRIMARY_DATE_SELECTOR = (
    r"#Morningstar\ Equity\ Report > span:nth-child(3) > sdps-date-time"
    r" > time > span:nth-child(2)"
)

# Fallback selectors, tried in order when the primary link selector fails.
_FALLBACK_LINK_SELECTORS = (
    "a.sr-report-link[href*='morningstar']",
    "a[href*='morningstar'][href*='pdf']",
    "#morningstar-section a.sr-report-link",
    "div[id*='Morningstar'] a",
)
_FALLBACK_DATE_SELECTORS = (
    _PRIMARY_DATE_SELECTOR,
    "sdps-date-time time span",
    "time span",
)

_DEBUG_SCREENSHOT_PATH = "debug_morningstar_not_found.png"

# Browser-side search for a link whose href mentions both 'morningstar' and 'pdf'.
_FIND_LINK_JS = """
() => {
    // Look for any link containing 'morningstar' and 'pdf'
    const links = Array.from(document.querySelectorAll('a[href]'));
    const morningstarLink = links.find(link =>
        link.href.toLowerCase().includes('morningstar') &&
        link.href.toLowerCase().includes('pdf')
    );
    if (morningstarLink) {
        // Try to find associated date
        let dateText = null;
        const parent = morningstarLink.closest('[id*="Morningstar"]') || morningstarLink.parentElement;
        if (parent) {
            const timeElement = parent.querySelector('time');
            if (timeElement) {
                dateText = timeElement.textContent.trim();
            }
        }
        return { url: morningstarLink.href, date: dateText };
    }
    return null;
}
"""

# Browser-side summary of Morningstar-related elements, used for debug logging.
_PAGE_STATE_JS = """
() => {
    return {
        hasMorningstarSection: !!document.querySelector('#morningstar-section'),
        hasMorningstarDiv: !!document.querySelector('div[id*="Morningstar"]'),
        morningstarLinks: Array.from(document.querySelectorAll('a[href]'))
            .filter(a => a.href.toLowerCase().includes('morningstar'))
            .length,
        allReportLinks: Array.from(document.querySelectorAll('a.sr-report-link')).length
    }
}
"""

# Browser-side fetch of a URL, returned as the Base64 payload of a data URL.
_FETCH_BASE64_JS = """
async (url) => {
    const response = await fetch(url);
    const blob = await response.blob();
    return await new Promise((resolve) => {
        const reader = new FileReader();
        reader.onloadend = () => resolve(reader.result.split(',')[1]);
        reader.readAsDataURL(blob);
    });
}
"""


async def _find_date_text(page) -> Optional[str]:
    """Return the report date text using the first fallback date selector that yields one."""
    date_text = None
    for date_selector in _FALLBACK_DATE_SELECTORS:
        try:
            date_locator = page.locator(date_selector)
            if await date_locator.count() > 0:
                date_text = (await date_locator.first.inner_text()).strip()
                if date_text:
                    break
        except Exception:
            # Best effort: a failing selector just means we try the next one.
            # (Not a bare ``except`` so task cancellation still propagates.)
            continue
    return date_text


async def _capture_debug_state(page) -> None:
    """Save a full-page screenshot and log which Morningstar elements exist."""
    try:
        await page.screenshot(path=_DEBUG_SCREENSHOT_PATH, full_page=True)
        logger.debug("Saved debug screenshot to: %s", _DEBUG_SCREENSHOT_PATH)
        page_info = await page.evaluate(_PAGE_STATE_JS)
        logger.debug("Page state: %s", page_info)
    except Exception as exc:
        logger.debug("Failed to capture debug info: %s", exc)


async def find_report(page, debug: bool = False) -> Tuple[Optional[str], Optional[str]]:
    """Locate the Morningstar Equity Report link and date on the stock page.

    Three strategies are tried in order: the original CSS selector, a list of
    fallback CSS selectors, and finally a JavaScript search over all links.

    Args:
        page: The current Playwright page.
        debug: Enable debug logging; when nothing is found, also save a
            screenshot and log a summary of the page state.

    Returns:
        Tuple of (url, date) where:
        - url: The link's href; or '__CLICK_TO_OPEN__' when the link matched
          by the original selector has an empty href (JavaScript/blob link
          that must be clicked); or None when no report link was found.
        - date: The report date string if found, otherwise None.
    """
    # Strategy 1: Original selector
    if await page.is_visible(_REPORT_LINK_SELECTOR):
        if debug:
            logger.debug("Found Morningstar report using original selector")
        report_link = page.locator(_REPORT_LINK_SELECTOR)
        await report_link.scroll_into_view_if_needed()
        url = await report_link.get_attribute("href")

        date_locator = page.locator(_PRIMARY_DATE_SELECTOR)
        date_text = None
        if await date_locator.count() > 0:
            date_text = (await date_locator.inner_text()).strip()

        if not url:
            # Empty href: a web component that generates a blob URL on click.
            if debug:
                logger.debug(
                    "Link found but href is empty - this is a modern web "
                    "component that generates blob URLs on click"
                )
            return CLICK_TO_OPEN_MARKER, date_text
        return url, date_text

    # Strategy 2: Look for any link containing "morningstar" in research section
    if debug:
        logger.debug("Original selector failed, trying fallback selectors...")
    for selector in _FALLBACK_LINK_SELECTORS:
        try:
            if not await page.is_visible(selector, timeout=2000):
                continue
            if debug:
                logger.debug(
                    "Found Morningstar report using fallback selector: %s", selector
                )
            report_link = page.locator(selector).first
            await report_link.scroll_into_view_if_needed()
            # NOTE: the href is returned as-is here, even when empty; a falsy
            # url makes download_report_as_bytes() return None.
            url = await report_link.get_attribute("href")
            return url, await _find_date_text(page)
        except Exception as exc:
            if debug:
                logger.debug("Fallback selector %s failed: %s", selector, exc)

    # Strategy 3: Use JavaScript to search for Morningstar links
    if debug:
        logger.debug("All CSS selectors failed, trying JavaScript search...")
    try:
        result = await page.evaluate(_FIND_LINK_JS)
        if result and result.get("url"):
            if debug:
                logger.debug(
                    "Found Morningstar report using JavaScript search: %s",
                    result["url"],
                )
            return result["url"], result.get("date")
    except Exception as exc:
        if debug:
            logger.debug("JavaScript search failed: %s", exc)

    # No report found
    if debug:
        logger.debug("No Morningstar report link found using any strategy")
        await _capture_debug_state(page)
    return None, None


async def download_report_as_bytes(page, url: str, debug: bool = False) -> Optional[bytes]:
    """Open the PDF in a new page and return bytes via data URL conversion.

    Args:
        page: The current Playwright page.
        url: Either a traditional URL or the '__CLICK_TO_OPEN__' marker for
            blob URLs.
        debug: Enable debug logging.

    Returns:
        PDF bytes if successful. None when ``url`` is empty, when clicking the
        link / loading the popup fails, or when the bytes cannot be extracted.

    Raises:
        Exception: For a traditional URL, errors raised while opening the new
            page or waiting for it to load are propagated to the caller.
    """
    if not url:
        return None

    new_page = None
    try:
        if url == CLICK_TO_OPEN_MARKER:
            # Handle blob URL case (modern web component)
            if debug:
                logger.debug("Handling blob URL - clicking link to open PDF")
            try:
                # expect_page() starts listening *before* the click. A plain
                # ``wait_for_event(...)`` coroutine does nothing until awaited,
                # so awaiting it after the click can miss the event.
                async with page.context.expect_page(timeout=15000) as new_page_info:
                    await page.click(_REPORT_LINK_SELECTOR)
                new_page = await new_page_info.value
                if debug:
                    logger.debug("New page opened with URL: %s", new_page.url)
                await new_page.wait_for_load_state("load", timeout=10000)
                # The popup's own URL is the blob URL to fetch.
                fetch_url = new_page.url
            except Exception as exc:
                if debug:
                    logger.debug("Error clicking link to open PDF: %s", exc)
                return None
        else:
            # Traditional URL case
            if debug:
                logger.debug("Opening PDF from traditional URL: %s", url)
            async with page.context.expect_page() as new_page_info:
                await page.evaluate("url => window.open(url, '_blank')", url)
            new_page = await new_page_info.value
            await new_page.wait_for_load_state("load")
            fetch_url = url

        # Fetch and convert to Base64 in browser context
        try:
            pdf_base64 = await new_page.evaluate(_FETCH_BASE64_JS, fetch_url)
            if not pdf_base64:
                return None
            return base64.b64decode(pdf_base64)
        except Exception as exc:
            if debug:
                logger.debug("Error extracting PDF bytes: %s", exc)
            return None
    finally:
        # Always close the popup, including on error paths.
        if new_page is not None:
            try:
                await new_page.close()
            except Exception as exc:
                # Best-effort cleanup; the page may already be closed.
                if debug:
                    logger.debug("Error closing PDF page: %s", exc)