Fix build: Bundle schwab_scraper source and use local dependencies
All checks were successful
Build and Push Docker Image / build (push) Successful in 34s
All checks were successful
Build and Push Docker Image / build (push) Successful in 34s
This commit is contained in:
239
schwab_scraper/features/equity/morningstar.py
Normal file
239
schwab_scraper/features/equity/morningstar.py
Normal file
@@ -0,0 +1,239 @@
|
||||
from typing import Optional, Tuple
|
||||
import logging
|
||||
|
||||
|
||||
# Marker returned instead of a URL when the link has no href and must be clicked.
_CLICK_TO_OPEN_MARKER = '__CLICK_TO_OPEN__'

# Strategy 1: the primary link and date selectors (the id contains spaces).
_PRIMARY_LINK_SELECTOR = "div[id='Morningstar Equity Report'] a.sr-report-link"
_PRIMARY_DATE_SELECTOR = (
    r"#Morningstar\ Equity\ Report > span:nth-child(3) > sdps-date-time > time > span:nth-child(2)"
)

# Strategy 2: progressively looser link selectors, tried in order.
_FALLBACK_LINK_SELECTORS = (
    "a.sr-report-link[href*='morningstar']",
    "a[href*='morningstar'][href*='pdf']",
    "#morningstar-section a.sr-report-link",
    "div[id*='Morningstar'] a",
)

# Date selectors used together with the fallback link selectors, tried in order.
_FALLBACK_DATE_SELECTORS = (
    _PRIMARY_DATE_SELECTOR,
    "sdps-date-time time span",
    "time span",
)

# Strategy 3: in-page search for an <a> whose href mentions both
# 'morningstar' and 'pdf'; returns {url, date} or null.
_LINK_SEARCH_JS = """
() => {
    const links = Array.from(document.querySelectorAll('a[href]'));
    const morningstarLink = links.find(link =>
        link.href.toLowerCase().includes('morningstar') &&
        link.href.toLowerCase().includes('pdf')
    );

    if (morningstarLink) {
        // Try to find associated date
        let dateText = null;
        const parent = morningstarLink.closest('[id*="Morningstar"]') || morningstarLink.parentElement;
        if (parent) {
            const timeElement = parent.querySelector('time');
            if (timeElement) {
                dateText = timeElement.textContent.trim();
            }
        }

        return {
            url: morningstarLink.href,
            date: dateText
        };
    }

    return null;
}
"""

# Debug-only snapshot of which Morningstar-related elements exist on the page.
_PAGE_STATE_JS = """
() => {
    return {
        hasMorningstarSection: !!document.querySelector('#morningstar-section'),
        hasMorningstarDiv: !!document.querySelector('div[id*="Morningstar"]'),
        morningstarLinks: Array.from(document.querySelectorAll('a[href]'))
            .filter(a => a.href.toLowerCase().includes('morningstar'))
            .length,
        allReportLinks: Array.from(document.querySelectorAll('a.sr-report-link')).length
    }
}
"""


async def _first_date_text(page, selectors) -> Optional[str]:
    """Try each date selector in order and return the first non-empty text.

    A selector that raises is skipped. If a selector matched but only yielded
    an empty string and no later selector yields text, that empty string is
    returned; if nothing matched at all, None is returned.
    """
    date_text = None
    for date_sel in selectors:
        try:
            date_locator = page.locator(date_sel)
            if await date_locator.count() > 0:
                date_text = (await date_locator.first.inner_text()).strip()
                if date_text:
                    break
        except Exception:
            # `except Exception` (not a bare except) so task cancellation and
            # KeyboardInterrupt still propagate out of this coroutine.
            continue
    return date_text


async def _log_page_state(page, logger) -> None:
    """Save a full-page screenshot and log a summary of Morningstar elements."""
    try:
        await page.screenshot(path="debug_morningstar_not_found.png", full_page=True)
        logger.debug("Saved debug screenshot to: debug_morningstar_not_found.png")

        page_info = await page.evaluate(_PAGE_STATE_JS)
        logger.debug("Page state: %s", page_info)
    except Exception as e:
        logger.debug("Failed to capture debug info: %s", e)


async def find_report(page, debug: bool = False) -> Tuple[Optional[str], Optional[str]]:
    """Locate the Morningstar Equity Report link and date on the stock page.

    Three strategies are tried in order: the primary selector, a list of
    fallback CSS selectors, and finally a JavaScript search over every
    ``a[href]`` on the page.

    Args:
        page: The Playwright page showing the stock's research section.
        debug: When True, log each step at DEBUG level and, if nothing is
            found, save ``debug_morningstar_not_found.png`` and log a summary
            of the page state.

    Returns:
        Tuple of (url, date) where:
        - url: The href attribute if it's a traditional link, or the special
          marker '__CLICK_TO_OPEN__' if the primary link has no href and must
          be clicked to obtain the document
        - date: The report date string if found
        ``(None, None)`` is returned when no strategy finds a link.
    """
    logger = logging.getLogger(__name__)

    # Strategy 1: Original selector
    if await page.is_visible(_PRIMARY_LINK_SELECTOR):
        if debug:
            logger.debug("Found Morningstar report using original selector")

        # page.is_visible() inspects the first match, whereas locator actions
        # are strict and raise when several elements match; `.first` keeps
        # both steps looking at the same element.
        report_link_element = page.locator(_PRIMARY_LINK_SELECTOR).first
        await report_link_element.scroll_into_view_if_needed()
        url = await report_link_element.get_attribute("href")

        date_locator = page.locator(_PRIMARY_DATE_SELECTOR)
        date_text = None
        if await date_locator.count() > 0:
            date_text = (await date_locator.first.inner_text()).strip()

        # Missing or empty href: the caller has to click the link instead.
        if not url:
            if debug:
                logger.debug("Link found but href is empty - this is a modern web component that generates blob URLs on click")
            return _CLICK_TO_OPEN_MARKER, date_text

        return url, date_text

    # Strategy 2: Look for any link containing "morningstar" in research section
    if debug:
        logger.debug("Original selector failed, trying fallback selectors...")

    for selector in _FALLBACK_LINK_SELECTORS:
        try:
            # NOTE(review): Playwright's documentation marks is_visible's
            # `timeout` option as ignored - confirm before relying on a wait.
            if await page.is_visible(selector, timeout=2000):
                if debug:
                    logger.debug("Found Morningstar report using fallback selector: %s", selector)

                report_link_element = page.locator(selector).first
                await report_link_element.scroll_into_view_if_needed()
                url = await report_link_element.get_attribute("href")

                date_text = await _first_date_text(page, _FALLBACK_DATE_SELECTORS)
                return url, date_text
        except Exception as e:
            if debug:
                logger.debug("Fallback selector %s failed: %s", selector, e)
            continue

    # Strategy 3: Use JavaScript to search for Morningstar links
    if debug:
        logger.debug("All CSS selectors failed, trying JavaScript search...")

    try:
        result = await page.evaluate(_LINK_SEARCH_JS)
        if result and result.get('url'):
            if debug:
                logger.debug("Found Morningstar report using JavaScript search: %s", result['url'])
            return result['url'], result.get('date')
    except Exception as e:
        if debug:
            logger.debug("JavaScript search failed: %s", e)

    # No report found
    if debug:
        logger.debug("No Morningstar report link found using any strategy")
        await _log_page_state(page, logger)

    return None, None
|
||||
|
||||
|
||||
async def download_report_as_bytes(page, url: str, debug: bool = False) -> Optional[bytes]:
    """Open the PDF in a new page and return bytes via data URL conversion.

    The document is opened in a new tab (by clicking the report link, or by
    ``window.open`` for a plain URL), fetched from inside that tab, converted
    to Base64 in the browser and decoded here. The new tab is always closed
    before returning.

    Args:
        page: The current Playwright page
        url: Either a traditional URL or '__CLICK_TO_OPEN__' marker for blob URLs
        debug: Enable debug logging

    Returns:
        PDF bytes if successful, None otherwise (empty ``url``, failure to
        open the clicked link, or failure to fetch/decode the document).

    Raises:
        Whatever Playwright raises while opening a traditional URL in a new
        tab; those errors are not swallowed.
    """
    import base64

    logger = logging.getLogger(__name__)

    if not url:
        return None

    new_page = None
    try:
        # Handle blob URL case (modern web component)
        if url == '__CLICK_TO_OPEN__':
            if debug:
                logger.debug("Handling blob URL - clicking link to open PDF")

            report_link_selector = "div[id='Morningstar Equity Report'] a.sr-report-link"

            try:
                # expect_page() registers the listener *before* the click.
                # An un-awaited wait_for_event() coroutine would only start
                # listening after the click and could miss the popup.
                async with page.context.expect_page(timeout=15000) as popup_info:
                    await page.click(report_link_selector)
                new_page = await popup_info.value

                if debug:
                    logger.debug("New page opened with URL: %s", new_page.url)

                # Wait for PDF to load
                await new_page.wait_for_load_state('load', timeout=10000)

                # The new tab's own URL is what gets fetched below.
                blob_url = new_page.url
            except Exception as e:
                if debug:
                    logger.debug("Error clicking link to open PDF: %s", e)
                return None
        else:
            # Traditional URL case
            if debug:
                logger.debug("Opening PDF from traditional URL: %s", url)

            async with page.context.expect_page() as popup_info:
                await page.evaluate("url => window.open(url, '_blank')", url)
            new_page = await popup_info.value
            await new_page.wait_for_load_state('load')
            blob_url = url

        # Fetch and convert to Base64 in browser context
        try:
            pdf_base64 = await new_page.evaluate(
                """
                async (url) => {
                    const response = await fetch(url);
                    if (!response.ok) {
                        throw new Error('PDF fetch failed with HTTP ' + response.status);
                    }
                    const blob = await response.blob();
                    return await new Promise((resolve, reject) => {
                        const reader = new FileReader();
                        // onload/onerror instead of onloadend: on a read error
                        // reader.result is null, and dereferencing it inside
                        // the handler would leave the promise pending forever.
                        reader.onload = () => resolve(reader.result.split(',')[1]);
                        reader.onerror = () => reject(reader.error);
                        reader.readAsDataURL(blob);
                    });
                }
                """,
                blob_url,
            )

            if not pdf_base64:
                return None

            return base64.b64decode(pdf_base64)
        except Exception as e:
            if debug:
                logger.debug("Error extracting PDF bytes: %s", e)
            return None
    finally:
        # Close the tab on every exit path so a failed load or fetch does not
        # leak pages in the browser context.
        if new_page is not None:
            try:
                await new_page.close()
            except Exception as e:
                # Best-effort cleanup: the tab may already be gone.
                if debug:
                    logger.debug("Error closing PDF page: %s", e)
|
||||
Reference in New Issue
Block a user