Fix build: Bundle schwab_scraper source and use local dependencies

2026-04-24 01:50:20 +00:00
parent 02ac293692
commit 650ea2d087
43 changed files with 10900 additions and 41 deletions
--- a/schwab_scraper/features/equity/morningstar.py
+++ b/schwab_scraper/features/equity/morningstar.py
@@ -0,0 +1,239 @@
+from typing import Optional, Tuple
+import logging
+
+
+async def find_report(page, debug: bool = False) -> Tuple[Optional[str], Optional[str]]:
+    """Locate the Morningstar Equity Report link and its date on the stock page.
+
+    Three strategies are tried in order, stopping at the first that matches,
+    so the lookup keeps working when the Schwab page markup changes:
+
+    1. The exact selector for the "Morningstar Equity Report" container.
+    2. A list of progressively looser CSS selectors.
+    3. An in-page JavaScript scan for links whose href contains both
+       "morningstar" and "pdf".
+
+    Args:
+        page: The current Playwright page showing the stock.
+        debug: When true, log each step at DEBUG level and, if no link is
+            found, save a full-page screenshot to
+            ``debug_morningstar_not_found.png`` plus a summary of the page.
+
+    Returns:
+        Tuple of (url, date) where:
+        - url: The href attribute if it's a traditional link, the special
+               marker '__CLICK_TO_OPEN__' if the primary link has no href
+               (a JavaScript/blob link that requires clicking), or None when
+               no link was found. A link matched by a fallback selector is
+               returned with its href as-is, which may be None or empty.
+        - date: The report date string if found, otherwise None.
+    """
+    logger = logging.getLogger(__name__)
+
+    # Date element inside the primary container (spaces in the id are escaped).
+    primary_date_selector = (
+        r"#Morningstar\ Equity\ Report > span:nth-child(3) > sdps-date-time > time > span:nth-child(2)"
+    )
+
+    # Strategy 1: Original selector
+    report_link_selector = "div[id='Morningstar Equity Report'] a.sr-report-link"
+    if await page.is_visible(report_link_selector):
+        if debug:
+            logger.debug("Found Morningstar report using original selector")
+        # page.is_visible() looks at the first match only, while locator
+        # actions are strict and raise when several elements match; pin the
+        # locator to the first match so both steps agree on the element.
+        report_link_element = page.locator(report_link_selector).first
+        await report_link_element.scroll_into_view_if_needed()
+        url = await report_link_element.get_attribute("href")
+
+        date_text = None
+        date_locator = page.locator(primary_date_selector)
+        if await date_locator.count() > 0:
+            date_text = (await date_locator.first.inner_text()).strip()
+
+        # An empty href means a modern web component that builds a blob URL
+        # on click; signal the caller that the link has to be clicked.
+        if not url:
+            if debug:
+                logger.debug("Link found but href is empty - this is a modern web component that generates blob URLs on click")
+            return '__CLICK_TO_OPEN__', date_text
+
+        return url, date_text
+
+    # Strategy 2: Look for any link containing "morningstar" in research section
+    if debug:
+        logger.debug("Original selector failed, trying fallback selectors...")
+
+    fallback_selectors = [
+        "a.sr-report-link[href*='morningstar']",
+        "a[href*='morningstar'][href*='pdf']",
+        "#morningstar-section a.sr-report-link",
+        "div[id*='Morningstar'] a",
+    ]
+    # Ordered from most to least specific; the last two are page-wide guesses.
+    date_selectors = [
+        primary_date_selector,
+        "sdps-date-time time span",
+        "time span",
+    ]
+
+    for selector in fallback_selectors:
+        try:
+            if await page.is_visible(selector, timeout=2000):
+                if debug:
+                    logger.debug("Found Morningstar report using fallback selector: %s", selector)
+                report_link_element = page.locator(selector).first
+                await report_link_element.scroll_into_view_if_needed()
+                url = await report_link_element.get_attribute("href")
+
+                # Try to find date with various selectors
+                date_text = None
+                for date_sel in date_selectors:
+                    try:
+                        date_locator = page.locator(date_sel)
+                        if await date_locator.count() > 0:
+                            date_text = (await date_locator.first.inner_text()).strip()
+                            if date_text:
+                                break
+                    except Exception:
+                        # Best effort: a failing date selector must not lose
+                        # the link. Only Exception is caught so that task
+                        # cancellation and KeyboardInterrupt still propagate.
+                        continue
+
+                return url, date_text
+        except Exception as e:
+            if debug:
+                logger.debug("Fallback selector %s failed: %s", selector, e)
+            continue
+
+    # Strategy 3: Use JavaScript to search for Morningstar links
+    if debug:
+        logger.debug("All CSS selectors failed, trying JavaScript search...")
+
+    try:
+        result = await page.evaluate("""
+            () => {
+                // Look for any link containing 'morningstar' and 'pdf'
+                const links = Array.from(document.querySelectorAll('a[href]'));
+                const morningstarLink = links.find(link => 
+                    link.href.toLowerCase().includes('morningstar') && 
+                    link.href.toLowerCase().includes('pdf')
+                );
+                
+                if (morningstarLink) {
+                    // Try to find associated date
+                    let dateText = null;
+                    const parent = morningstarLink.closest('[id*="Morningstar"]') || morningstarLink.parentElement;
+                    if (parent) {
+                        const timeElement = parent.querySelector('time');
+                        if (timeElement) {
+                            dateText = timeElement.textContent.trim();
+                        }
+                    }
+                    
+                    return {
+                        url: morningstarLink.href,
+                        date: dateText
+                    };
+                }
+                
+                return null;
+            }
+        """)
+
+        if result and result.get('url'):
+            if debug:
+                logger.debug("Found Morningstar report using JavaScript search: %s", result['url'])
+            return result['url'], result.get('date')
+    except Exception as e:
+        if debug:
+            logger.debug("JavaScript search failed: %s", e)
+
+    # No report found
+    if debug:
+        logger.debug("No Morningstar report link found using any strategy")
+        # Capture page state for debugging
+        try:
+            await page.screenshot(path="debug_morningstar_not_found.png", full_page=True)
+            logger.debug("Saved debug screenshot to: debug_morningstar_not_found.png")
+
+            # Log available elements for debugging
+            page_info = await page.evaluate("""
+                () => {
+                    return {
+                        hasMorningstarSection: !!document.querySelector('#morningstar-section'),
+                        hasMorningstarDiv: !!document.querySelector('div[id*="Morningstar"]'),
+                        morningstarLinks: Array.from(document.querySelectorAll('a[href]'))
+                            .filter(a => a.href.toLowerCase().includes('morningstar'))
+                            .length,
+                        allReportLinks: Array.from(document.querySelectorAll('a.sr-report-link')).length
+                    }
+                }
+            """)
+            logger.debug("Page state: %s", page_info)
+        except Exception as e:
+            logger.debug("Failed to capture debug info: %s", e)
+
+    return None, None
+
+
+async def download_report_as_bytes(page, url: str, debug: bool = False) -> Optional[bytes]:
+    """Open the PDF in a new page and return bytes via data URL conversion.
+
+    The PDF is fetched with ``fetch()`` inside the newly opened page, turned
+    into a Base64 data URL in the browser, and decoded here. The opened page
+    is always closed before returning.
+
+    Args:
+        page: The current Playwright page
+        url: Either a traditional URL or '__CLICK_TO_OPEN__' marker for blob URLs
+        debug: Enable debug logging
+
+    Returns:
+        PDF bytes if successful. None when ``url`` is empty, when clicking the
+        report link fails, or when the bytes cannot be extracted.
+
+    Raises:
+        Exception: For a traditional URL, errors raised while opening the new
+            page or waiting for it to load propagate to the caller.
+    """
+    import base64
+
+    logger = logging.getLogger(__name__)
+
+    if not url:
+        return None
+
+    new_page = None
+    try:
+        # Handle blob URL case (modern web component)
+        if url == '__CLICK_TO_OPEN__':
+            if debug:
+                logger.debug("Handling blob URL - clicking link to open PDF")
+
+            # Click the Morningstar report link to open the PDF
+            report_link_selector = "div[id='Morningstar Equity Report'] a.sr-report-link"
+
+            try:
+                # expect_page() registers the listener BEFORE the click runs.
+                # An un-awaited wait_for_event() coroutine would only start
+                # listening after the click, and could miss the new page.
+                async with page.context.expect_page(timeout=15000) as new_page_info:
+                    await page.click(report_link_selector)
+                new_page = await new_page_info.value
+
+                if debug:
+                    logger.debug("New page opened with URL: %s", new_page.url)
+
+                # Wait for PDF to load
+                await new_page.wait_for_load_state('load', timeout=10000)
+            except Exception as e:
+                if debug:
+                    logger.debug("Error clicking link to open PDF: %s", e)
+                return None
+
+            # The PDF is now loaded as a blob URL - extract it
+            blob_url = new_page.url
+        else:
+            # Traditional URL case
+            if debug:
+                logger.debug("Opening PDF from traditional URL: %s", url)
+
+            async with page.context.expect_page() as new_page_info:
+                await page.evaluate("url => window.open(url, '_blank')", url)
+            new_page = await new_page_info.value
+            await new_page.wait_for_load_state('load')
+            blob_url = url
+
+        # Fetch and convert to Base64 in browser context
+        try:
+            pdf_base64 = await new_page.evaluate(
+                """
+                async (url) => {
+                    const response = await fetch(url);
+                    if (!response.ok) {
+                        throw new Error('PDF fetch failed with HTTP status ' + response.status);
+                    }
+                    const blob = await response.blob();
+                    return await new Promise((resolve, reject) => {
+                        const reader = new FileReader();
+                        reader.onerror = () => reject(reader.error);
+                        reader.onload = () => resolve(reader.result.split(',')[1]);
+                        reader.readAsDataURL(blob);
+                    });
+                }
+                """,
+                blob_url,
+            )
+
+            if not pdf_base64:
+                return None
+
+            return base64.b64decode(pdf_base64)
+        except Exception as e:
+            if debug:
+                logger.debug("Error extracting PDF bytes: %s", e)
+            return None
+    finally:
+        # Close the opened page on every path (success, failure, cancellation)
+        # so that failed downloads do not leave stray tabs in the context.
+        if new_page is not None:
+            try:
+                await new_page.close()
+            except Exception:
+                # Best effort: the page may already be closed or crashed.
+                pass