Fix build: Bundle schwab_scraper source and use local dependencies
All checks were successful
Build and Push Docker Image / build (push) Successful in 34s
All checks were successful
Build and Push Docker Image / build (push) Successful in 34s
This commit is contained in:
452
schwab_scraper/features/equity/service.py
Normal file
452
schwab_scraper/features/equity/service.py
Normal file
@@ -0,0 +1,452 @@
|
||||
import time
|
||||
from typing import Any, Dict, Optional
|
||||
import logging
|
||||
from ...core.config import load_config, get_playwright_url
|
||||
from ...browser.auth import ensure_cookies
|
||||
from ...browser.client import connect, new_context, new_page
|
||||
from ...browser.navigation import goto_with_auth_check
|
||||
from ...core import Envelope, ErrorType, MorningstarData, EquityPhase1Data, fail, ok
|
||||
from .morningstar import find_report, download_report_as_bytes
|
||||
from ...storage.cache import ensure_cache_dir, cache_filename, read_cached_pdf, write_cached_pdf
|
||||
from .parser import parse as parse_pdf
|
||||
from .scraper import extract_dividend_data
|
||||
from .phase1_scraper import extract_phase1_data # DOM scraping - the working approach
|
||||
import re
|
||||
|
||||
def extract_company_name_from_title(page_title: str, ticker: str) -> Optional[str]:
    """Derive a company name from a Schwab research page title.

    Two strategies are tried in order:

    1. Match ``<name> (<TICKER>)`` at the start of the title and return
       ``<name>``.
    2. Otherwise return the text in front of the first `` |`` or `` -``
       separator, as long as it is not merely the ticker itself.

    Args:
        page_title: Raw page title; an empty or ``None`` title yields ``None``.
        ticker: Ticker symbol expected in parentheses after the company
            name. Compared case-insensitively.

    Returns:
        The extracted company name, or ``None`` when nothing plausible is
        found or when the inputs are not strings.
    """
    if not page_title:
        return None

    try:
        symbol = ticker.upper()

        # Site suffixes and multi-word boilerplate phrases are safe to drop
        # anywhere in the title.
        title = page_title
        for phrase in (
            " | Charles Schwab",
            " - Charles Schwab",
            "Stock Quote & Summary",
            "Stock Research",
        ):
            title = title.replace(phrase, "")

        # Match "<name> (<TICKER>)" BEFORE stripping the bare word
        # "Research": that word can be part of a real company name
        # (e.g. "Lam Research Corp") and must not be cut out of it.
        match = re.match(rf"^(.+?)\s*\({re.escape(symbol)}\)", title, re.IGNORECASE)
        if match:
            company_name = match.group(1).strip().replace(" -", "").strip()
            if len(company_name) > 1 and not company_name.isdigit():
                return company_name

        # Fallback: no usable "<name> (<TICKER>)" prefix. Drop the remaining
        # "Research" boilerplate and take the text before the first separator.
        title = title.replace("Research", "")
        for separator in (" |", " -"):
            if separator in title:
                potential_name = title.split(separator)[0].strip()
                if potential_name.upper() != symbol and len(potential_name) > 1:
                    return potential_name
        return None
    except (AttributeError, TypeError):
        # Non-string title or ticker (e.g. ticker=None): best-effort helper,
        # so report "no name" instead of raising.
        return None
|
||||
|
||||
|
||||
async def get_equity_phase1_data(ticker: str, debug: bool = False) -> Envelope[EquityPhase1Data]:
    """Collect the Phase 1 equity data set for a single ticker.

    Opens the Schwab stock research page for ``ticker`` through a remote
    Playwright browser, checks that stock-specific content rendered, and
    passes the page to ``extract_phase1_data`` for DOM scraping.

    The data set is documented as covering:
    - Quote/price data (symbol bar)
    - Enhanced dividend information (forward-looking dates)
    - Core earnings metrics (EPS, forecasts)
    - Basic valuation ratios (P/E, Forward P/E, PEG)
    - Calculated metrics (payout ratio)

    Args:
        ticker: Stock ticker symbol; upper-cased before use.
        debug: Raise this module's logger to DEBUG and use the shorter
            navigation timeout.

    Returns:
        An ``ok`` envelope wrapping the scraped data, or a ``fail``
        envelope (authentication or validation error).
    """
    ticker = ticker.upper()
    log = logging.getLogger(__name__)
    if debug:
        log.setLevel(logging.DEBUG)
    log.debug(f"Starting get_equity_phase1_data for {ticker}")

    # Nothing can be scraped without an authenticated session.
    session_cookies = await ensure_cookies()
    if not session_cookies:
        return fail(
            "Unable to establish a session. Provide credentials in config.json or a valid cookies.json.",
            ErrorType.AUTHENTICATION,
            retryable=False,
        )

    playwright_url = get_playwright_url(load_config())

    research_url = f"https://client.schwab.com/app/research/#/stocks/{ticker}"
    # Either element showing up means stock-specific content was rendered.
    stock_content_selector = (
        'span.sdps-title-3.sc-sdps-solo-layout:not(.sdps-font-bold), #morningstar-section'
    )
    content_probe_js = '''
        () => {
            const nameSpan = document.querySelector('span.sdps-title-3.sc-sdps-solo-layout:not(.sdps-font-bold)');
            if (nameSpan && nameSpan.textContent && nameSpan.textContent.trim().length > 2) {
                return true;
            }
            const morningstarSection = document.querySelector('#morningstar-section');
            if (morningstarSection) {
                return true;
            }
            return false;
        }
    '''

    context = None
    page = None
    playwright, browser = await connect(playwright_url)
    try:
        context = await new_context(browser, cookies=session_cookies)
        page = await new_page(context)

        # Load the stock research page (auth-aware navigation).
        if debug:
            nav_timeout = 30000
        else:
            nav_timeout = 45000
        authenticated = await goto_with_auth_check(
            page,
            context,
            research_url,
            debug=debug,
            timeout=nav_timeout,
        )
        if not authenticated:
            return fail(
                "Authentication failed while navigating to research page",
                ErrorType.AUTHENTICATION,
                retryable=True,
            )

        if debug:
            log.debug(f"Current page URL: {page.url}")

        # Step 1: wait for stock-specific content; a timeout is reported
        # as an invalid ticker.
        try:
            await page.wait_for_selector(
                stock_content_selector,
                timeout=10000,
                state='visible',
            )
        except Exception as wait_err:
            if debug:
                log.debug(f"Timeout waiting for stock content: {wait_err}")
            return fail(
                f"Invalid ticker: {ticker}. This appears not to be a valid stock ticker.",
                ErrorType.VALIDATION,
                retryable=False,
            )

        # Step 2: confirm in-page that the content is actually populated.
        try:
            content_ok = await page.evaluate(content_probe_js)
            if not content_ok:
                return fail(
                    f"Invalid ticker: {ticker}. This appears not to be a valid stock ticker.",
                    ErrorType.VALIDATION,
                    retryable=False,
                )
        except Exception as e:
            log.debug(f"Error checking for valid content: {e}")
            return fail(
                f"Invalid ticker: {ticker}. Unable to validate ticker.",
                ErrorType.VALIDATION,
                retryable=False,
            )

        # DOM scraping is used here; the original notes that the API
        # approach failed due to CORS restrictions.
        scraped = await extract_phase1_data(page, debug=debug)
        return ok(scraped)

    finally:
        # Best-effort teardown, innermost resource first. A failure while
        # closing one handle must not prevent closing the others.
        for resource, closer in (
            (page, "close"),
            (context, "close"),
            (browser, "close"),
            (playwright, "stop"),
        ):
            try:
                if resource is not None:
                    await getattr(resource, closer)()
            except Exception:
                pass
|
||||
|
||||
|
||||
async def get_morningstar_data(ticker: str, debug: bool = False) -> Envelope[MorningstarData]:
    """Gather Morningstar report data and dividend details for a ticker.

    Workflow: ensure the cache directory and a session exist, open the
    Schwab stock research page, validate that stock content rendered,
    read the company name and dividend data from the DOM, then locate the
    Morningstar report. A downloaded PDF is written to the cache and
    parsed; when no PDF bytes are obtained, a previously cached PDF is
    parsed instead.

    Args:
        ticker: Stock ticker symbol; upper-cased before use.
        debug: Raise this module's logger to DEBUG and use the shorter
            navigation timeout.

    Returns:
        An ``ok`` envelope wrapping ``MorningstarData``, or a ``fail``
        envelope for authentication, validation, or parsing errors.
    """
    ticker = ticker.upper()
    ensure_cache_dir()
    log = logging.getLogger(__name__)
    if debug:
        log.setLevel(logging.DEBUG)
    log.debug(f"Starting get_morningstar_data for {ticker}")

    # Nothing can be scraped without an authenticated session.
    session_cookies = await ensure_cookies()
    if not session_cookies:
        return fail(
            "Unable to establish a session. Provide credentials in config.json or a valid cookies.json.",
            ErrorType.AUTHENTICATION,
            retryable=False,
        )

    playwright_url = get_playwright_url(load_config())

    research_url = f"https://client.schwab.com/app/research/#/stocks/{ticker}"
    # Either element showing up means stock-specific content was rendered.
    stock_content_selector = (
        'span.sdps-title-3.sc-sdps-solo-layout:not(.sdps-font-bold), #morningstar-section'
    )
    content_probe_js = '''
        () => {
            // Look for company name span (valid stock pages have this)
            const nameSpan = document.querySelector('span.sdps-title-3.sc-sdps-solo-layout:not(.sdps-font-bold)');
            if (nameSpan && nameSpan.textContent && nameSpan.textContent.trim().length > 2) {
                return true;
            }

            // Look for Morningstar section (valid stock pages have this)
            const morningstarSection = document.querySelector('#morningstar-section');
            if (morningstarSection) {
                return true;
            }

            // Look for company profile description (valid stock pages have this)
            const profileText = document.querySelector('p.sdps-text-body.sc-sdps-solo-layout');
            if (profileText && profileText.textContent && profileText.textContent.trim().length > 50) {
                return true;
            }

            // Look for any stock-related content
            const stockContent = document.querySelector('#stock-details, #quote, [data-testid="stock-quote"]');
            if (stockContent) {
                return true;
            }

            return false;
        }
    '''
    company_name_js = '''
        () => {
            // Look for company name in title span
            const nameSpan = document.querySelector('span.sdps-title-3.sc-sdps-solo-layout:not(.sdps-font-bold)');
            if (nameSpan && nameSpan.textContent && nameSpan.textContent.trim().length > 2) {
                return nameSpan.textContent.trim();
            }

            // Fallback: Extract from company profile description
            const profileText = document.querySelector('p.sdps-text-body.sc-sdps-solo-layout');
            if (profileText && profileText.textContent) {
                const text = profileText.textContent.trim();
                // Extract company name before " designs" or " is" or " provides"
                const match = text.match(/^([A-Za-z0-9\\s&\\.,'-]+?)(?:\\s+(?:designs|is|provides|manufactures|operates|offers|engages))/i);
                if (match) {
                    return match[1].trim();
                }
            }

            return null;
        }
    '''

    context = None
    page = None
    playwright, browser = await connect(playwright_url)
    try:
        context = await new_context(browser, cookies=session_cookies)
        page = await new_page(context)

        # Shared auth-aware navigation helper; debug runs use the shorter
        # timeout.
        if debug:
            nav_timeout = 30000
        else:
            nav_timeout = 45000
        authenticated = await goto_with_auth_check(
            page,
            context,
            research_url,
            debug=debug,
            timeout=nav_timeout,
        )
        if not authenticated:
            return fail(
                "Authentication failed while navigating to research page",
                ErrorType.AUTHENTICATION,
                retryable=True,
            )

        # Schwab does not redirect on invalid tickers; the page simply has
        # no stock content, so validity is judged from the DOM.
        if debug:
            log.debug(f"Current page URL: {page.url}")

        # Step 1: the research page populates asynchronously, so wait for
        # the company name or Morningstar section before validating.
        try:
            await page.wait_for_selector(
                stock_content_selector,
                timeout=10000,
                state='visible',
            )
        except Exception as wait_err:
            # Neither element appeared within 10 seconds: treated as an
            # invalid ticker.
            if debug:
                log.debug(f"Timeout waiting for stock content: {wait_err}")
            return fail(
                f"Invalid ticker: {ticker}. This appears not to be a valid stock ticker.",
                ErrorType.VALIDATION,
                retryable=False,
            )

        # Step 2: in-page check that real stock content is present.
        try:
            content_ok = await page.evaluate(content_probe_js)

            if debug:
                log.debug(f"Valid stock content detected: {content_ok}")

            if not content_ok:
                if debug:
                    log.debug(f"Invalid ticker detected - no stock content found")
                return fail(
                    f"Invalid ticker: {ticker}. This appears not to be a valid stock ticker.",
                    ErrorType.VALIDATION,
                    retryable=False,
                )
        except Exception as e:
            log.debug(f"Error checking for valid content: {e}")
            # Unable to verify: report the ticker as invalid.
            return fail(
                f"Invalid ticker: {ticker}. Unable to validate ticker.",
                ErrorType.VALIDATION,
                retryable=False,
            )

        # Company name: title span first, profile paragraph as fallback.
        # A failure here is non-fatal and leaves the name as None.
        company_name = None
        try:
            company_name = await page.evaluate(company_name_js)
            if debug and company_name:
                log.debug(f"Extracted company name: {company_name}")
        except Exception as e:
            log.debug(f"Company name extraction error: {e}")

        # Give the Morningstar section time to render; absence is only logged.
        try:
            await page.wait_for_selector('#morningstar-section', timeout=30000)
        except Exception:
            log.debug("#morningstar-section not found within timeout")

        # Dividend details are optional; fall back to an empty mapping.
        try:
            dividend_data = await extract_dividend_data(page, debug=debug)
        except Exception as exc:
            log.debug(f"Dividend extraction error: {exc}")
            dividend_data = {}

        # Locate the report, then download it.
        report_url, report_date = await find_report(page, debug=debug)
        report_meta: Dict[str, Any] = {}
        if report_date:
            report_meta["Morningstar Equity Report Date"] = report_date.strip()

        pdf_bytes = None
        if report_url:
            # The __CLICK_TO_OPEN__ marker is not a real URL, so it is not
            # stored, but the download helper is still invoked with it.
            if report_url != '__CLICK_TO_OPEN__':
                report_meta["Morningstar Equity Report URL"] = report_url
            pdf_bytes = await download_report_as_bytes(page, report_url, debug=debug)

        parsed_data: Dict[str, Any] = {}
        if pdf_bytes:
            # Cache key date: the report date as MM-DD-YYYY when it parses,
            # a dash-joined raw date otherwise, or today's date if unknown.
            if report_date:
                from datetime import datetime

                try:
                    parsed_date = datetime.strptime(report_date, "%b %d, %Y")
                    formatted_date = parsed_date.strftime("%m-%d-%Y")
                except Exception:
                    formatted_date = report_date.replace(" ", "-")
            else:
                formatted_date = time.strftime("%m-%d-%Y")
            write_cached_pdf(ticker, formatted_date, pdf_bytes)

            try:
                parsed_data = parse_pdf(pdf_bytes)
                parsed_data["source"] = "live"
            except Exception as exc:
                log.debug(f"PDF parsing failed: {exc}")
                parsed_data = {"error": "Failed to parse Morningstar report"}
        else:
            # No fresh PDF: try the most recent cached copy instead.
            cached_pdf = read_cached_pdf(ticker)
            if not cached_pdf:
                parsed_data = {"error": f"Failed to download and no cache available for {ticker}"}
            else:
                try:
                    parsed_data = parse_pdf(cached_pdf)
                    parsed_data["source"] = "cache"
                except Exception as exc:
                    log.debug(f"Cached PDF parsing failed: {exc}")
                    parsed_data = {"error": "Failed to parse cached Morningstar report"}

        morningstar = MorningstarData(
            ticker=ticker,
            company_name=company_name,
            previous_dividend_payment=dividend_data.get("Previous Dividend Payment"),
            previous_pay_date=dividend_data.get("Previous Pay Date"),
            previous_ex_date=dividend_data.get("Previous Ex-Dividend Date"),
            frequency=dividend_data.get("Frequency"),
            annual_dividend_rate=dividend_data.get("Annual Dividend Rate"),
            annual_dividend_yield=dividend_data.get("Annual Dividend Yield"),
            fair_value=parsed_data.get("Fair Value"),
            economic_moat=parsed_data.get("Economic Moat"),
            capital_allocation=parsed_data.get("Capital Allocation"),
            rating=_safe_int(parsed_data.get("Morningstar Rating")),
            one_star_price=parsed_data.get("1-Star Price"),
            five_star_price=parsed_data.get("5-Star Price"),
            assessment=parsed_data.get("Assessment"),
            range_52_week=parsed_data.get("52-Week Range"),
            dividend_yield=parsed_data.get("Dividend Yield"),
            investment_style=parsed_data.get("Investment Style"),
            report_url=report_meta.get("Morningstar Equity Report URL"),
            report_date=report_meta.get("Morningstar Equity Report Date"),
            source=parsed_data.get("source"),
        )

        # Any download/parse problem is surfaced as a retryable parsing error.
        if parsed_data.get("error"):
            return fail(parsed_data["error"], ErrorType.PARSING, retryable=True)

        return ok(morningstar)

    finally:
        # Best-effort teardown, innermost resource first. A failure while
        # closing one handle must not prevent closing the others.
        for resource, closer in (
            (page, "close"),
            (context, "close"),
            (browser, "close"),
            (playwright, "stop"),
        ):
            try:
                if resource is not None:
                    await getattr(resource, closer)()
            except Exception:
                pass
|
||||
|
||||
|
||||
def _safe_int(value: Any) -> Optional[int]:
|
||||
if value is None:
|
||||
return None
|
||||
try:
|
||||
return int(str(value).strip())
|
||||
except (TypeError, ValueError):
|
||||
return None
|
||||
Reference in New Issue
Block a user