Fix build: Bundle schwab_scraper source and use local dependencies
All checks were successful
Build and Push Docker Image / build (push) Successful in 34s
All checks were successful
Build and Push Docker Image / build (push) Successful in 34s
This commit is contained in:
490
schwab_scraper/features/equity/phase1_api_scraper.py
Normal file
490
schwab_scraper/features/equity/phase1_api_scraper.py
Normal file
@@ -0,0 +1,490 @@
|
||||
"""Phase 1: API-Based Data Extraction (EXPERIMENTAL - NON-FUNCTIONAL)
|
||||
|
||||
⚠️ **STATUS: NON-FUNCTIONAL DUE TO CORS RESTRICTIONS** ⚠️
|
||||
|
||||
This module was an attempt to extract equity data by calling Schwab's REST APIs directly.
|
||||
While the APIs exist and were discovered via HAR analysis, they are NOT accessible from
|
||||
this scraper due to fundamental browser security limitations (CORS).
|
||||
|
||||
## Why This Approach Failed:
|
||||
|
||||
1. **CORS (Cross-Origin Resource Sharing) Restrictions**:
|
||||
- Research page: `client.schwab.com`, APIs: `ausgateway.schwab.com` (different origins)
|
||||
- Browser blocks cross-origin fetch() calls even from page.evaluate()
|
||||
- Results in "TypeError: Failed to fetch"
|
||||
|
||||
2. **Authentication Complexity**:
|
||||
- Direct HTTP (aiohttp) with cookies: 401/403 errors
|
||||
- Playwright page.request.fetch(): 401 errors (separate context)
|
||||
- Likely requires dynamic tokens beyond cookies
|
||||
|
||||
## Recommendation:
|
||||
|
||||
**Use `phase1_scraper.py` (DOM scraping) instead**. It works reliably with authenticated
|
||||
sessions and extracts all Phase 1 fields without CORS limitations.
|
||||
|
||||
## API Endpoints (discovered but inaccessible):
|
||||
- Quote: /api/is.ResearchExperience/v1/quote
|
||||
- Dividends: /api/is.ResearchExperience/v1/events/dividends
|
||||
- Earnings: /api/is.ResearchExperience/v1/events/earnings
|
||||
- Share Profile: /api/is.ResearchExperience/v1/shareprofile
|
||||
"""
|
||||
|
||||
from typing import Dict, Any, Optional, List
|
||||
import logging
|
||||
import uuid
|
||||
import aiohttp
|
||||
from playwright.async_api import Page
|
||||
|
||||
from ...core import (
|
||||
QuoteData, EnhancedDividends, EarningsData,
|
||||
CalculatedMetrics, EquityPhase1Data
|
||||
)
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def _parse_float(value: Any) -> Optional[float]:
|
||||
"""Safely parse a value to float."""
|
||||
if value is None:
|
||||
return None
|
||||
try:
|
||||
if isinstance(value, str):
|
||||
# Remove % sign if present
|
||||
value = value.replace('%', '').strip()
|
||||
return float(value)
|
||||
except (ValueError, TypeError):
|
||||
return None
|
||||
|
||||
|
||||
def _parse_market_cap(value: str) -> Optional[str]:
|
||||
"""Parse market cap string like '$3.03T' or '$462.11B'."""
|
||||
if not value:
|
||||
return None
|
||||
# Keep the formatted string as-is for readability
|
||||
return value.strip()
|
||||
|
||||
|
||||
def _parse_volume(value: Any) -> Optional[int]:
|
||||
"""Parse volume value."""
|
||||
if value is None:
|
||||
return None
|
||||
try:
|
||||
return int(float(value))
|
||||
except (ValueError, TypeError):
|
||||
return None
|
||||
|
||||
|
||||
def parse_quote_api_response(data: Dict[str, Any]) -> QuoteData:
    """Parse a quote API response into a QuoteData object.

    Parsing is best-effort: any exception is logged at debug level and the
    QuoteData populated so far is returned.

    API Response Structure:
    {
        "reference": {
            "symbol": "JNJ",
            "companyName": "JOHNSON & JOHNSON",
            "exchangeName": "NYSE"
        },
        "quote": {
            "lastPrice": 193.155,
            "netChange": 1.275,
            "netChangePercent": 0.6644778,
            "postMarketChange": 0.0,
            "postMarketPercentChange": 0.0,
            "tradeTime": "2025-10-22T17:06:42.008Z"
        },
        "regularQuote": {
            "lastPrice": 193.155,
            "lastSize": 100.0,
            "netChange": 1.275,
            "percentChange": 0.6644778,
            ...
        }
    }

    Args:
        data: One decoded quote object shaped as shown above.

    Returns:
        A QuoteData with every field that could be read from ``data``.
    """
    quote = QuoteData()

    try:
        # Use ``or {}`` rather than a ``.get`` default: a section that is
        # present but null would otherwise come back as None and abort the
        # rest of the parse on the first ``.get`` call.
        reference = data.get('reference') or {}
        quote_data = data.get('quote') or {}
        regular_quote = data.get('regularQuote') or {}

        # Basic info
        quote.exchange = reference.get('exchangeName')

        # Price data
        quote.price = _parse_float(quote_data.get('lastPrice'))
        quote.change = _parse_float(quote_data.get('netChange'))
        quote.change_percent = _parse_float(quote_data.get('netChangePercent'))

        # After hours (post market)
        quote.after_hours_change = _parse_float(quote_data.get('postMarketChange'))
        quote.after_hours_change_percent = _parse_float(quote_data.get('postMarketPercentChange'))

        # Extended quote data
        quote.previous_close = _parse_float(regular_quote.get('closePrice'))
        quote.open = _parse_float(regular_quote.get('openPrice'))
        quote.bid = _parse_float(regular_quote.get('bidPrice'))
        quote.ask = _parse_float(regular_quote.get('askPrice'))
        quote.volume = _parse_volume(regular_quote.get('totalVolume'))
        quote.day_range_low = _parse_float(regular_quote.get('lowPrice'))
        quote.day_range_high = _parse_float(regular_quote.get('highPrice'))
        quote.week_52_low = _parse_float(regular_quote.get('priceLow52W'))
        quote.week_52_high = _parse_float(regular_quote.get('priceHigh52W'))

        # Bid/Ask size. A null size counts as 0 so that one missing side
        # is not rendered as the literal text "None".
        bid_size = regular_quote.get('bidSize') or 0
        ask_size = regular_quote.get('askSize') or 0
        if bid_size or ask_size:
            quote.bid_ask_size = f"{bid_size}/{ask_size}"

        # Volume vs average
        # NOTE(review): this stores the raw 'averageVolumeDaily' value, not
        # a computed comparison against today's volume - confirm that is
        # what consumers of ``volume_vs_avg`` expect.
        avg_volume_label = regular_quote.get('averageVolumeDaily')
        if avg_volume_label:
            quote.volume_vs_avg = avg_volume_label

    except Exception as e:
        # Deliberate best-effort: return whatever was parsed before the error
        logger.debug(f"Error parsing quote API response: {e}")

    return quote
|
||||
|
||||
|
||||
def parse_dividends_api_response(data: Dict[str, Any]) -> EnhancedDividends:
    """Build an EnhancedDividends object from a dividends API response.

    The first entry of ``data['dividends']`` fills the ``next_*``,
    frequency and annual fields; the second entry, when present, fills the
    ``previous_*`` fields. Any exception is logged at debug level and the
    object populated so far is returned.

    API Response Structure:
    {
        "symbol": "JNJ",
        "currentAnnualDividendMethod": "IAD",
        "status": "DIVIDENDS_PAID_CURRENTLY",
        "dividends": [
            {
                "dividendPayment": 1.3,
                "dividendPayDate": "December 09, 2025",
                "dividendExDate": "November 25, 2025",
                "dividendFrequency": "Quarterly",
                "annualDividendRate": 5.2,
                "dividendYield": "2.71%"
            },
            ...
        ]
    }

    Args:
        data: Decoded dividends response shaped as shown above.

    Returns:
        An EnhancedDividends instance; left at its defaults when the
        response carries no dividend entries.
    """
    # (attribute, response key, converter or None for "store as-is")
    upcoming_map = (
        ('next_payment', 'dividendPayment', _parse_float),
        ('next_pay_date', 'dividendPayDate', None),
        ('next_ex_date', 'dividendExDate', None),
        ('frequency', 'dividendFrequency', None),
        ('annual_rate', 'annualDividendRate', _parse_float),
        ('annual_yield', 'dividendYield', _parse_float),
    )
    prior_map = (
        ('previous_payment', 'dividendPayment', _parse_float),
        ('previous_pay_date', 'dividendPayDate', None),
        ('previous_ex_date', 'dividendExDate', None),
    )

    result = EnhancedDividends()

    try:
        history = data.get('dividends', [])
        if history:
            # Entry 0 is treated as the next/upcoming dividend
            current = history[0]
            for attr, key, convert in upcoming_map:
                raw = current.get(key)
                setattr(result, attr, convert(raw) if convert else raw)

            # Entry 1, if the history has one, is the previous dividend
            if len(history) >= 2:
                prior = history[1]
                for attr, key, convert in prior_map:
                    raw = prior.get(key)
                    setattr(result, attr, convert(raw) if convert else raw)

    except Exception as e:
        logger.debug(f"Error parsing dividends API response: {e}")

    return result
|
||||
|
||||
|
||||
def parse_earnings_api_response(data: Dict[str, Any]) -> EarningsData:
    """Parse an earnings API response into an EarningsData object.

    Parsing is best-effort: any exception is logged at debug level and the
    EarningsData populated so far is returned.

    API Response Structure:
    {
        "symbol": "GOOGL",
        "fundamentals": {},
        "upcoming": {
            "earningsDate": "10/29/2025",
            "numberOfAnalysts": 43,
            "epsNonGaapEstimate": 2.18
        },
        "historical": [
            {
                "epsGaapActual": 2.31,
                "epsNonGaapActual": 2.31,
                "earningsDate": "07/23/2025",
                "numberOfAnalysts": 43,
                "epsNonGaapEstimate": 2.18,
                "epsNonGaapEstimateHigh": 2.42,
                "epsNonGaapEstimateLow": 2.0
            }
        ]
    }

    Args:
        data: Decoded earnings response shaped as shown above.

    Returns:
        An EarningsData with every field that could be read from ``data``.
    """
    earnings = EarningsData()

    try:
        upcoming = data.get('upcoming', {})
        historical = data.get('historical', [])
        fundamentals = data.get('fundamentals', {})

        # Upcoming earnings
        if upcoming:
            earnings.next_announcement_date = upcoming.get('earningsDate')
            earnings.announcement_timing = upcoming.get('announcementTiming')
            earnings.analysts_covering = upcoming.get('numberOfAnalysts')
            earnings.consensus_estimate = _parse_float(upcoming.get('epsNonGaapEstimate'))
            earnings.estimate_high = _parse_float(upcoming.get('epsNonGaapEstimateHigh'))
            earnings.estimate_low = _parse_float(upcoming.get('epsNonGaapEstimateLow'))

        # Historical earnings (most recent)
        if historical:
            latest = historical[0]

            # Prefer the non-GAAP figure and fall back to GAAP only when
            # it is missing or unparseable. Compare against None instead of
            # using ``or`` so a genuine EPS of 0.0 is not discarded.
            # NOTE(review): ``eps_ttm`` is filled from a single historical
            # entry; confirm whether that entry is really a trailing-
            # twelve-month figure, since it feeds the payout ratio.
            actual_eps = _parse_float(latest.get('epsNonGaapActual'))
            if actual_eps is None:
                actual_eps = _parse_float(latest.get('epsGaapActual'))
            earnings.eps_ttm = actual_eps

            # If we don't have upcoming, use latest historical for analyst data
            if not upcoming:
                earnings.analysts_covering = latest.get('numberOfAnalysts')
                earnings.consensus_estimate = _parse_float(latest.get('epsNonGaapEstimate'))
                earnings.estimate_high = _parse_float(latest.get('epsNonGaapEstimateHigh'))
                earnings.estimate_low = _parse_float(latest.get('epsNonGaapEstimateLow'))

            # Beat/miss information
            beat_amount = latest.get('epsNonGaapBeat')
            if beat_amount is not None:
                earnings.recent_beats = [{
                    'beat_amount': _parse_float(beat_amount),
                    'beat_percent': _parse_float(latest.get('epsNonGaapBeatPercent')),
                    'date': latest.get('earningsDate')
                }]

        # Fundamentals (PE ratios, revenue)
        if fundamentals:
            earnings.pe_ttm = _parse_float(fundamentals.get('peRatio'))
            earnings.forward_pe = _parse_float(fundamentals.get('forwardPE'))
            earnings.peg_ratio = _parse_float(fundamentals.get('pegRatio'))
            earnings.revenue_ttm = _parse_float(fundamentals.get('revenue'))

    except Exception as e:
        # Deliberate best-effort: return whatever was parsed before the error
        logger.debug(f"Error parsing earnings API response: {e}")

    return earnings
|
||||
|
||||
|
||||
def parse_shareprofile_api_response(data: Dict[str, Any], quote: QuoteData) -> QuoteData:
    """Copy the market cap from a share profile response onto ``quote``.

    ``quote`` is modified in place and also returned. Any exception is
    logged at debug level, in which case ``quote`` is returned unchanged.

    API Response Structure:
    {
        "companySummary": {
            "marketCapLabel": "Large Cap",
            "marketCapValue": "$462.11B",
            "companyEnterpriseValue": "$462.11B"
        },
        "shareInfo": [{
            "sharesOutstanding": "2.41B",
            "sharesHeld": "71.29%"
        }]
    }

    Args:
        data: Decoded share profile response shaped as shown above.
        quote: QuoteData to enhance.

    Returns:
        The same ``quote`` object that was passed in.
    """
    try:
        # Market cap lives under companySummary.marketCapValue
        raw_market_cap = data.get('companySummary', {}).get('marketCapValue')
        quote.market_cap = _parse_market_cap(raw_market_cap)

        # Sector info is not read here: it may not be part of the
        # shareprofile API and might instead come from securityprofiles
        # or other endpoints.

    except Exception as e:
        logger.debug(f"Error parsing share profile API response: {e}")

    return quote
|
||||
|
||||
|
||||
def calculate_payout_ratio(annual_dividend: Optional[float], eps_ttm: Optional[float]) -> Optional[float]:
    """Return the dividend payout ratio as a percentage.

    Formula: (Annual Dividend Rate / EPS TTM) × 100, rounded to two
    decimal places.

    Args:
        annual_dividend: Annual dividend rate per share.
        eps_ttm: Earnings per share used as the denominator.

    Returns:
        The ratio in percent, or ``None`` when either input is missing or
        zero, or when ``eps_ttm`` is not positive.
    """
    # Missing (None) or zero inputs give no meaningful ratio
    if not annual_dividend or not eps_ttm:
        return None
    # Written as a negated ``>`` so a NaN denominator is rejected as well
    if not eps_ttm > 0:
        return None
    return round((annual_dividend / eps_ttm) * 100, 2)
|
||||
|
||||
|
||||
async def call_schwab_api(page: Page, url: str, debug: bool = False) -> Optional[Dict[str, Any]]:
    """GET a Schwab API endpoint by running ``fetch()`` inside the page.

    The request is issued through ``page.evaluate()`` with
    ``credentials: 'include'`` and a fixed set of Schwab client headers,
    plus two freshly generated UUID correlation IDs. Failures never
    raise: a non-OK HTTP status, a JavaScript error, or a Python-side
    exception all result in ``None``.

    Note: the module documentation reports that cross-origin calls made
    this way fail with "TypeError: Failed to fetch"; that case surfaces
    here as a ``None`` return.

    Args:
        page: Playwright page with authenticated session
        url: API endpoint URL
        debug: When true, log the request, response keys and any failure
            at debug level; when false, nothing is logged.

    Returns:
        The decoded JSON body (whatever ``response.json()`` produced), or
        ``None`` on any error.
    """
    # Runs in the browser; always resolves to {success, status?, data|error}
    fetch_script = """
        async ({url, correlatorId, clientCorrelId}) => {
            try {
                const response = await fetch(url, {
                    method: 'GET',
                    credentials: 'include', // Include cookies
                    headers: {
                        'accept': 'application/json',
                        'accept-language': 'en-US,en;q=0.9',
                        'cache-control': 'no-cache',
                        'content-type': 'application/json',
                        'correlatorid': correlatorId,
                        'pragma': 'no-cache',
                        'schwab-client-appid': 'AD00007800',
                        'schwab-client-channel': 'IO',
                        'schwab-client-correlid': clientCorrelId,
                        'schwab-resource-version': '2',
                    }
                });

                if (!response.ok) {
                    const errorText = await response.text();
                    return {
                        success: false,
                        status: response.status,
                        error: errorText
                    };
                }

                const data = await response.json();
                return {
                    success: true,
                    status: response.status,
                    data: data
                };
            } catch (error) {
                return {
                    success: false,
                    error: error.toString()
                };
            }
        }
    """

    try:
        if debug:
            logger.debug(f"Calling API: {url}")

        # One fresh correlation ID per header, generated for every call
        script_args = {
            'url': url,
            'correlatorId': str(uuid.uuid4()),
            'clientCorrelId': str(uuid.uuid4()),
        }
        outcome = await page.evaluate(fetch_script, script_args)

        if outcome.get('success'):
            payload = outcome.get('data')
            if debug and payload:
                logger.debug(f"API response keys: {list(payload.keys()) if isinstance(payload, dict) else 'list'}")
            return payload

        # The script reported a failure (HTTP error or fetch exception)
        if debug:
            status = outcome.get('status', 'unknown')
            error = outcome.get('error', 'unknown error')
            logger.debug(f"API returned status {status}: {str(error)[:200]}")
        return None

    except Exception as e:
        if debug:
            logger.debug(f"Error calling API {url}: {e}")
        return None
|
||||
|
||||
|
||||
async def extract_phase1_data_api(page: Page, ticker: str, debug: bool = False) -> EquityPhase1Data:
    """Extract Phase 1 data using Schwab's REST APIs.

    Calls the quote, dividends, earnings and share profile endpoints one
    after another through ``call_schwab_api`` and parses each response.
    A call that fails (returns ``None``) leaves the corresponding section
    at its default-constructed value, so this function still returns a
    result when every request fails.

    Args:
        page: Playwright page with authenticated session
        ticker: Stock ticker symbol
        debug: Enable debug logging

    Returns:
        EquityPhase1Data with all extracted fields
    """
    from dataclasses import fields as dataclass_fields
    # Aliased because ``quote`` is used below for the QuoteData result
    from urllib.parse import quote as url_quote

    if debug:
        logger.debug(f"Starting API-based Phase 1 extraction for {ticker}")

    base_url = "https://ausgateway.schwab.com/api/is.ResearchExperience/v1"

    # Percent-encode the ticker so a symbol containing reserved characters
    # (e.g. "/" or "&") cannot alter the query string.
    symbol = url_quote(ticker, safe='')

    # Build API URLs
    quote_url = f"{base_url}/quote?symbols={symbol}&isComplex=true"
    dividends_url = f"{base_url}/events/dividends?symbol={symbol}"
    earnings_url = f"{base_url}/events/earnings?symbols={symbol}"
    profile_url = f"{base_url}/shareprofile?symbols={symbol}&includeSubsidiaries=true"

    # Make API calls via fetch() executed inside the page (see call_schwab_api)
    quote_data = await call_schwab_api(page, quote_url, debug)
    dividends_data = await call_schwab_api(page, dividends_url, debug)
    earnings_data = await call_schwab_api(page, earnings_url, debug)
    profile_data = await call_schwab_api(page, profile_url, debug)

    # Parse responses
    # Quote API returns a list, get first item
    if quote_data and isinstance(quote_data, list) and len(quote_data) > 0:
        quote = parse_quote_api_response(quote_data[0])
    elif quote_data and isinstance(quote_data, dict):
        quote = parse_quote_api_response(quote_data)
    else:
        quote = QuoteData()

    # Enhance quote with share profile data
    if profile_data:
        quote = parse_shareprofile_api_response(profile_data, quote)

    # Parse dividends
    dividends = parse_dividends_api_response(dividends_data) if dividends_data else EnhancedDividends()

    # Parse earnings
    earnings = parse_earnings_api_response(earnings_data) if earnings_data else EarningsData()

    # Calculate derived metrics
    calculated = CalculatedMetrics()
    if dividends.annual_rate and earnings.eps_ttm:
        calculated.payout_ratio = calculate_payout_ratio(
            dividends.annual_rate,
            earnings.eps_ttm
        )

    # Create Phase 1 data object
    phase1_data = EquityPhase1Data(
        ticker=ticker,
        quote=quote,
        dividends=dividends,
        earnings=earnings,
        calculated_metrics=calculated
    )

    if debug:
        logger.debug(f"API-based Phase 1 extraction complete for {ticker}")
        # Count populated fields (dataclasses with slots don't have __dict__).
        # Totals come from the dataclass definitions so the log stays
        # correct when fields are added or removed.
        quote_fields = dataclass_fields(quote)
        div_fields = dataclass_fields(dividends)
        earn_fields = dataclass_fields(earnings)
        quote_count = sum(1 for f in quote_fields if getattr(quote, f.name) is not None)
        div_count = sum(1 for f in div_fields if getattr(dividends, f.name) is not None)
        # Earnings also treats an empty list as unpopulated
        earn_count = sum(1 for f in earn_fields if getattr(earnings, f.name) not in (None, []))
        logger.debug(f" Quote fields populated: {quote_count}/{len(quote_fields)}")
        logger.debug(f" Dividend fields populated: {div_count}/{len(div_fields)}")
        logger.debug(f" Earnings fields populated: {earn_count}/{len(earn_fields)}")

    return phase1_data
|
||||
|
||||
Reference in New Issue
Block a user