Fix build: Bundle schwab_scraper source and use local dependencies
All checks were successful
Build and Push Docker Image / build (push) Successful in 34s

This commit is contained in:
2026-04-24 01:50:20 +00:00
parent 02ac293692
commit 650ea2d087
43 changed files with 10900 additions and 41 deletions

View File

@@ -0,0 +1,490 @@
"""Phase 1: API-Based Data Extraction (EXPERIMENTAL - NON-FUNCTIONAL)
⚠️ **STATUS: NON-FUNCTIONAL DUE TO CORS RESTRICTIONS** ⚠️
This module was an attempt to extract equity data by calling Schwab's REST APIs directly.
While the APIs exist and were discovered via HAR analysis, they are NOT accessible from
this scraper due to fundamental browser security limitations (CORS).
## Why This Approach Failed:
1. **CORS (Cross-Origin Resource Sharing) Restrictions**:
- Research page: `client.schwab.com`, APIs: `ausgateway.schwab.com` (different origins)
- Browser blocks cross-origin fetch() calls even from page.evaluate()
- Results in "TypeError: Failed to fetch"
2. **Authentication Complexity**:
- Direct HTTP (aiohttp) with cookies: 401/403 errors
- Playwright page.request.fetch(): 401 errors (separate context)
- Likely requires dynamic tokens beyond cookies
## Recommendation:
**Use `phase1_scraper.py` (DOM scraping) instead**. It works reliably with authenticated
sessions and extracts all Phase 1 fields without CORS limitations.
## API Endpoints (discovered but inaccessible):
- Quote: /api/is.ResearchExperience/v1/quote
- Dividends: /api/is.ResearchExperience/v1/events/dividends
- Earnings: /api/is.ResearchExperience/v1/events/earnings
- Share Profile: /api/is.ResearchExperience/v1/shareprofile
"""
from typing import Dict, Any, Optional, List
import logging
import uuid
import aiohttp
from playwright.async_api import Page
from ...core import (
QuoteData, EnhancedDividends, EarningsData,
CalculatedMetrics, EquityPhase1Data
)
logger = logging.getLogger(__name__)
def _parse_float(value: Any) -> Optional[float]:
"""Safely parse a value to float."""
if value is None:
return None
try:
if isinstance(value, str):
# Remove % sign if present
value = value.replace('%', '').strip()
return float(value)
except (ValueError, TypeError):
return None
def _parse_market_cap(value: str) -> Optional[str]:
"""Parse market cap string like '$3.03T' or '$462.11B'."""
if not value:
return None
# Keep the formatted string as-is for readability
return value.strip()
def _parse_volume(value: Any) -> Optional[int]:
"""Parse volume value."""
if value is None:
return None
try:
return int(float(value))
except (ValueError, TypeError):
return None
def parse_quote_api_response(data: Dict[str, Any]) -> QuoteData:
    """Parse quote API response into QuoteData object.

    Parsing is best-effort: any unexpected error is logged at debug level and
    the fields populated so far are returned.

    API Response Structure:
        {
            "reference": {
                "symbol": "JNJ",
                "companyName": "JOHNSON & JOHNSON",
                "exchangeName": "NYSE"
            },
            "quote": {
                "lastPrice": 193.155,
                "netChange": 1.275,
                "netChangePercent": 0.6644778,
                "postMarketChange": 0.0,
                "postMarketPercentChange": 0.0,
                "tradeTime": "2025-10-22T17:06:42.008Z"
            },
            "regularQuote": {
                "lastPrice": 193.155,
                "lastSize": 100.0,
                "netChange": 1.275,
                "percentChange": 0.6644778,
                ...
            }
        }

    Args:
        data: One decoded quote object from the API.

    Returns:
        A ``QuoteData`` with every field that could be read from ``data``.
    """
    quote = QuoteData()
    try:
        # ``or {}`` (rather than a .get default) also covers sections that
        # are present but explicitly null in the JSON payload.
        reference = data.get('reference') or {}
        quote_data = data.get('quote') or {}
        regular_quote = data.get('regularQuote') or {}

        # Basic info
        quote.exchange = reference.get('exchangeName')

        # Price data
        quote.price = _parse_float(quote_data.get('lastPrice'))
        quote.change = _parse_float(quote_data.get('netChange'))
        quote.change_percent = _parse_float(quote_data.get('netChangePercent'))

        # After hours (post market)
        quote.after_hours_change = _parse_float(quote_data.get('postMarketChange'))
        quote.after_hours_change_percent = _parse_float(
            quote_data.get('postMarketPercentChange')
        )

        # Extended quote data
        quote.previous_close = _parse_float(regular_quote.get('closePrice'))
        quote.open = _parse_float(regular_quote.get('openPrice'))
        quote.bid = _parse_float(regular_quote.get('bidPrice'))
        quote.ask = _parse_float(regular_quote.get('askPrice'))
        quote.volume = _parse_volume(regular_quote.get('totalVolume'))
        quote.day_range_low = _parse_float(regular_quote.get('lowPrice'))
        quote.day_range_high = _parse_float(regular_quote.get('highPrice'))
        quote.week_52_low = _parse_float(regular_quote.get('priceLow52W'))
        quote.week_52_high = _parse_float(regular_quote.get('priceHigh52W'))

        # Bid/Ask size. A missing or null side is shown as 0 so the label
        # never reads "None/100".
        bid_size = regular_quote.get('bidSize')
        ask_size = regular_quote.get('askSize')
        if bid_size is None:
            bid_size = 0
        if ask_size is None:
            ask_size = 0
        if bid_size or ask_size:
            quote.bid_ask_size = f"{bid_size}/{ask_size}"

        # Volume vs average: the raw ``averageVolumeDaily`` value is stored
        # unchanged; no ratio is computed here.
        avg_volume_label = regular_quote.get('averageVolumeDaily')
        if avg_volume_label:
            quote.volume_vs_avg = avg_volume_label
    except Exception as e:
        # Deliberate best-effort: return whatever was parsed so far.
        logger.debug("Error parsing quote API response: %s", e)
    return quote
def parse_dividends_api_response(data: Dict[str, Any]) -> EnhancedDividends:
    """Build an EnhancedDividends object from a dividends API payload.

    The first entry of ``data["dividends"]`` fills the "next" fields plus
    frequency, annual rate and yield; the second entry, when present, fills
    the "previous" fields. Errors are logged at debug level and whatever was
    populated up to that point is returned.

    API Response Structure:
        {
            "symbol": "JNJ",
            "currentAnnualDividendMethod": "IAD",
            "status": "DIVIDENDS_PAID_CURRENTLY",
            "dividends": [
                {
                    "dividendPayment": 1.3,
                    "dividendPayDate": "December 09, 2025",
                    "dividendExDate": "November 25, 2025",
                    "dividendFrequency": "Quarterly",
                    "annualDividendRate": 5.2,
                    "dividendYield": "2.71%"
                },
                ...
            ]
        }
    """
    result = EnhancedDividends()
    try:
        history = data.get('dividends', [])
        if history:
            # Entry 0 is treated as the most recent / upcoming dividend.
            current = history[0]
            result.next_payment = _parse_float(current.get('dividendPayment'))
            result.next_pay_date = current.get('dividendPayDate')
            result.next_ex_date = current.get('dividendExDate')
            result.frequency = current.get('dividendFrequency')
            result.annual_rate = _parse_float(current.get('annualDividendRate'))
            result.annual_yield = _parse_float(current.get('dividendYield'))

            # Entry 1, if any, is the dividend before that one.
            if len(history) > 1:
                prior = history[1]
                result.previous_payment = _parse_float(prior.get('dividendPayment'))
                result.previous_pay_date = prior.get('dividendPayDate')
                result.previous_ex_date = prior.get('dividendExDate')
    except Exception as e:
        logger.debug(f"Error parsing dividends API response: {e}")
    return result
def parse_earnings_api_response(data: Dict[str, Any]) -> EarningsData:
    """Parse earnings API response into EarningsData object.

    Parsing is best-effort: any unexpected error is logged at debug level and
    the fields populated so far are returned.

    API Response Structure:
        {
            "symbol": "GOOGL",
            "fundamentals": {},
            "upcoming": {
                "earningsDate": "10/29/2025",
                "numberOfAnalysts": 43,
                "epsNonGaapEstimate": 2.18
            },
            "historical": [
                {
                    "epsGaapActual": 2.31,
                    "epsNonGaapActual": 2.31,
                    "earningsDate": "07/23/2025",
                    "numberOfAnalysts": 43,
                    "epsNonGaapEstimate": 2.18,
                    "epsNonGaapEstimateHigh": 2.42,
                    "epsNonGaapEstimateLow": 2.0
                }
            ]
        }

    Args:
        data: Decoded earnings payload.

    Returns:
        An ``EarningsData`` with every field that could be read from ``data``.
    """
    earnings = EarningsData()
    try:
        upcoming = data.get('upcoming', {})
        historical = data.get('historical', [])
        fundamentals = data.get('fundamentals', {})

        # Upcoming earnings
        if upcoming:
            earnings.next_announcement_date = upcoming.get('earningsDate')
            earnings.announcement_timing = upcoming.get('announcementTiming')
            earnings.analysts_covering = upcoming.get('numberOfAnalysts')
            earnings.consensus_estimate = _parse_float(upcoming.get('epsNonGaapEstimate'))
            earnings.estimate_high = _parse_float(upcoming.get('epsNonGaapEstimateHigh'))
            earnings.estimate_low = _parse_float(upcoming.get('epsNonGaapEstimateLow'))

        # Historical earnings (most recent entry is assumed to be first)
        if historical:
            latest = historical[0]

            # Prefer the non-GAAP actual and fall back to GAAP only when the
            # non-GAAP figure is missing or unparseable. A plain ``a or b``
            # would wrongly discard a legitimate EPS of exactly 0.0.
            # NOTE(review): the field is named eps_ttm but the value comes
            # from a single historical entry - confirm this is intended.
            eps_actual = _parse_float(latest.get('epsNonGaapActual'))
            if eps_actual is None:
                eps_actual = _parse_float(latest.get('epsGaapActual'))
            earnings.eps_ttm = eps_actual

            # If we don't have upcoming, use latest historical for analyst data
            if not upcoming:
                earnings.analysts_covering = latest.get('numberOfAnalysts')
                earnings.consensus_estimate = _parse_float(latest.get('epsNonGaapEstimate'))
                earnings.estimate_high = _parse_float(latest.get('epsNonGaapEstimateHigh'))
                earnings.estimate_low = _parse_float(latest.get('epsNonGaapEstimateLow'))

            # Beat/miss information
            beat_amount = latest.get('epsNonGaapBeat')
            if beat_amount is not None:
                earnings.recent_beats = [{
                    'beat_amount': _parse_float(beat_amount),
                    'beat_percent': _parse_float(latest.get('epsNonGaapBeatPercent')),
                    'date': latest.get('earningsDate'),
                }]

        # Fundamentals (PE ratios, revenue)
        if fundamentals:
            earnings.pe_ttm = _parse_float(fundamentals.get('peRatio'))
            earnings.forward_pe = _parse_float(fundamentals.get('forwardPE'))
            earnings.peg_ratio = _parse_float(fundamentals.get('pegRatio'))
            earnings.revenue_ttm = _parse_float(fundamentals.get('revenue'))
    except Exception as e:
        # Deliberate best-effort: return whatever was parsed so far.
        logger.debug("Error parsing earnings API response: %s", e)
    return earnings
def parse_shareprofile_api_response(data: Dict[str, Any], quote: QuoteData) -> QuoteData:
    """Copy the market cap from a share-profile payload onto ``quote``.

    Only ``companySummary.marketCapValue`` is read. ``quote`` is mutated in
    place and also returned. Errors are logged at debug level and ``quote``
    is returned in whatever state it had reached.

    API Response Structure:
        {
            "companySummary": {
                "marketCapLabel": "Large Cap",
                "marketCapValue": "$462.11B",
                "companyEnterpriseValue": "$462.11B"
            },
            "shareInfo": [{
                "sharesOutstanding": "2.41B",
                "sharesHeld": "71.29%"
            }]
        }
    """
    try:
        summary = data.get('companySummary', {})
        cap_text = summary.get('marketCapValue')
        quote.market_cap = _parse_market_cap(cap_text)
        # Sector information is not read here; it may live in a different
        # endpoint (e.g. securityprofiles) rather than in shareprofile.
    except Exception as e:
        logger.debug(f"Error parsing share profile API response: {e}")
    return quote
def calculate_payout_ratio(annual_dividend: Optional[float], eps_ttm: Optional[float]) -> Optional[float]:
    """Return the dividend payout ratio as a percentage.

    Computed as ``annual_dividend / eps_ttm * 100`` and rounded to two
    decimal places. ``None`` is returned when either input is missing or
    zero, or when ``eps_ttm`` is not strictly positive.
    """
    # Guard clauses: nothing meaningful can be computed from these inputs.
    if not annual_dividend or not eps_ttm:
        return None
    if not (eps_ttm > 0):
        return None
    percentage = (annual_dividend / eps_ttm) * 100
    return round(percentage, 2)
async def call_schwab_api(page: Page, url: str, debug: bool = False) -> Optional[Dict[str, Any]]:
    """GET a Schwab API endpoint via ``fetch()`` run inside the browser page.

    The request is issued through ``page.evaluate()`` with
    ``credentials: 'include'`` and a fixed set of Schwab client headers plus
    two freshly generated correlation UUIDs. The in-page script reports
    success or failure as a small dict instead of throwing.

    Args:
        page: Playwright page with authenticated session
        url: API endpoint URL
        debug: Enable debug logging

    Returns:
        The decoded JSON body on success; ``None`` when the response is not
        OK, the in-page fetch throws, or any Python-side error occurs.
    """
    # Kept flush-left so the script text passed to the browser is exactly
    # the original literal.
    fetch_script = """
async ({url, correlatorId, clientCorrelId}) => {
try {
const response = await fetch(url, {
method: 'GET',
credentials: 'include', // Include cookies
headers: {
'accept': 'application/json',
'accept-language': 'en-US,en;q=0.9',
'cache-control': 'no-cache',
'content-type': 'application/json',
'correlatorid': correlatorId,
'pragma': 'no-cache',
'schwab-client-appid': 'AD00007800',
'schwab-client-channel': 'IO',
'schwab-client-correlid': clientCorrelId,
'schwab-resource-version': '2',
}
});
if (!response.ok) {
const errorText = await response.text();
return {
success: false,
status: response.status,
error: errorText
};
}
const data = await response.json();
return {
success: true,
status: response.status,
data: data
};
} catch (error) {
return {
success: false,
error: error.toString()
};
}
}
"""
    try:
        if debug:
            logger.debug(f"Calling API: {url}")

        # One fresh UUID per correlation header.
        correlator_id = str(uuid.uuid4())
        client_correlid = str(uuid.uuid4())
        script_args = {
            'url': url,
            'correlatorId': correlator_id,
            'clientCorrelId': client_correlid,
        }

        outcome = await page.evaluate(fetch_script, script_args)

        if outcome.get('success'):
            payload = outcome.get('data')
            if debug and payload:
                shape = list(payload.keys()) if isinstance(payload, dict) else 'list'
                logger.debug(f"API response keys: {shape}")
            return payload

        # Failure path: non-OK HTTP status or an exception inside fetch().
        if debug:
            status = outcome.get('status', 'unknown')
            error = outcome.get('error', 'unknown error')
            logger.debug(f"API returned status {status}: {str(error)[:200]}")
        return None
    except Exception as e:
        if debug:
            logger.debug(f"Error calling API {url}: {e}")
        return None
async def extract_phase1_data_api(page: Page, ticker: str, debug: bool = False) -> EquityPhase1Data:
    """Extract Phase 1 data using Schwab's REST APIs.

    Four endpoints (quote, dividends, earnings, share profile) are called one
    after another through ``call_schwab_api``. Each response is parsed
    independently, so a failed call simply leaves that section at its
    defaults.

    Args:
        page: Playwright page with authenticated session
        ticker: Stock ticker symbol
        debug: Enable debug logging

    Returns:
        EquityPhase1Data with all extracted fields
    """
    # Local imports keep this block self-contained; ``quote`` is aliased
    # because a local variable of the same name is used below.
    from dataclasses import fields as dataclass_fields
    from urllib.parse import quote as url_quote

    if debug:
        logger.debug(f"Starting API-based Phase 1 extraction for {ticker}")

    base_url = "https://ausgateway.schwab.com/api/is.ResearchExperience/v1"

    # Percent-encode the symbol so characters such as '/', '&' or spaces
    # cannot corrupt the query string. Plain tickers are unchanged.
    symbol = url_quote(ticker, safe='')

    # Build API URLs
    quote_url = f"{base_url}/quote?symbols={symbol}&isComplex=true"
    dividends_url = f"{base_url}/events/dividends?symbol={symbol}"
    earnings_url = f"{base_url}/events/earnings?symbols={symbol}"
    profile_url = f"{base_url}/shareprofile?symbols={symbol}&includeSubsidiaries=true"

    # Each call runs fetch() inside the page via call_schwab_api.
    quote_data = await call_schwab_api(page, quote_url, debug)
    dividends_data = await call_schwab_api(page, dividends_url, debug)
    earnings_data = await call_schwab_api(page, earnings_url, debug)
    profile_data = await call_schwab_api(page, profile_url, debug)

    # Parse responses.
    # The quote endpoint may return a list (use its first item) or a dict.
    if isinstance(quote_data, list) and quote_data:
        quote = parse_quote_api_response(quote_data[0])
    elif isinstance(quote_data, dict) and quote_data:
        quote = parse_quote_api_response(quote_data)
    else:
        quote = QuoteData()

    # Enhance quote with share profile data
    if profile_data:
        quote = parse_shareprofile_api_response(profile_data, quote)

    # Parse dividends
    dividends = parse_dividends_api_response(dividends_data) if dividends_data else EnhancedDividends()

    # Parse earnings
    earnings = parse_earnings_api_response(earnings_data) if earnings_data else EarningsData()

    # Calculate derived metrics
    calculated = CalculatedMetrics()
    if dividends.annual_rate and earnings.eps_ttm:
        calculated.payout_ratio = calculate_payout_ratio(
            dividends.annual_rate,
            earnings.eps_ttm,
        )

    # Create Phase 1 data object (keeps the caller's original ticker text)
    phase1_data = EquityPhase1Data(
        ticker=ticker,
        quote=quote,
        dividends=dividends,
        earnings=earnings,
        calculated_metrics=calculated,
    )

    if debug:
        logger.debug(f"API-based Phase 1 extraction complete for {ticker}")
        # Count populated fields via dataclasses.fields() because
        # dataclasses with slots don't have __dict__.
        quote_count = sum(1 for f in dataclass_fields(quote) if getattr(quote, f.name) is not None)
        div_count = sum(1 for f in dataclass_fields(dividends) if getattr(dividends, f.name) is not None)
        earn_count = sum(1 for f in dataclass_fields(earnings) if getattr(earnings, f.name) not in (None, []))
        # NOTE(review): the denominators are hard-coded; confirm they still
        # match the dataclass definitions.
        logger.debug(f" Quote fields populated: {quote_count}/21")
        logger.debug(f" Dividend fields populated: {div_count}/9")
        logger.debug(f" Earnings fields populated: {earn_count}/13")

    return phase1_data