"""Phase 1: API-Based Data Extraction (EXPERIMENTAL - NON-FUNCTIONAL) ⚠️ **STATUS: NON-FUNCTIONAL DUE TO CORS RESTRICTIONS** ⚠️ This module was an attempt to extract equity data by calling Schwab's REST APIs directly. While the APIs exist and were discovered via HAR analysis, they are NOT accessible from this scraper due to fundamental browser security limitations (CORS). ## Why This Approach Failed: 1. **CORS (Cross-Origin Resource Sharing) Restrictions**: - Research page: `client.schwab.com`, APIs: `ausgateway.schwab.com` (different origins) - Browser blocks cross-origin fetch() calls even from page.evaluate() - Results in "TypeError: Failed to fetch" 2. **Authentication Complexity**: - Direct HTTP (aiohttp) with cookies: 401/403 errors - Playwright page.request.fetch(): 401 errors (separate context) - Likely requires dynamic tokens beyond cookies ## Recommendation: **Use `phase1_scraper.py` (DOM scraping) instead**. It works reliably with authenticated sessions and extracts all Phase 1 fields without CORS limitations. ## API Endpoints (discovered but inaccessible): - Quote: /api/is.ResearchExperience/v1/quote - Dividends: /api/is.ResearchExperience/v1/events/dividends - Earnings: /api/is.ResearchExperience/v1/events/earnings - Share Profile: /api/is.ResearchExperience/v1/shareprofile """ from typing import Dict, Any, Optional, List import logging import uuid import aiohttp from playwright.async_api import Page from ...core import ( QuoteData, EnhancedDividends, EarningsData, CalculatedMetrics, EquityPhase1Data ) logger = logging.getLogger(__name__) def _parse_float(value: Any) -> Optional[float]: """Safely parse a value to float.""" if value is None: return None try: if isinstance(value, str): # Remove % sign if present value = value.replace('%', '').strip() return float(value) except (ValueError, TypeError): return None def _parse_market_cap(value: str) -> Optional[str]: """Parse market cap string like '$3.03T' or '$462.11B'.""" if not value: return None # Keep the formatted string as-is for readability return value.strip() def _parse_volume(value: Any) -> Optional[int]: """Parse volume value.""" if value is None: return None try: return int(float(value)) except (ValueError, TypeError): return None def parse_quote_api_response(data: Dict[str, Any]) -> QuoteData: """Parse quote API response into QuoteData object. API Response Structure: { "reference": { "symbol": "JNJ", "companyName": "JOHNSON & JOHNSON", "exchangeName": "NYSE" }, "quote": { "lastPrice": 193.155, "netChange": 1.275, "netChangePercent": 0.6644778, "postMarketChange": 0.0, "postMarketPercentChange": 0.0, "tradeTime": "2025-10-22T17:06:42.008Z" }, "regularQuote": { "lastPrice": 193.155, "lastSize": 100.0, "netChange": 1.275, "percentChange": 0.6644778, ... } } """ quote = QuoteData() try: reference = data.get('reference', {}) quote_data = data.get('quote', {}) regular_quote = data.get('regularQuote', {}) # Basic info quote.exchange = reference.get('exchangeName') # Price data quote.price = _parse_float(quote_data.get('lastPrice')) quote.change = _parse_float(quote_data.get('netChange')) quote.change_percent = _parse_float(quote_data.get('netChangePercent')) # After hours (post market) quote.after_hours_change = _parse_float(quote_data.get('postMarketChange')) quote.after_hours_change_percent = _parse_float(quote_data.get('postMarketPercentChange')) # Extended quote data quote.previous_close = _parse_float(regular_quote.get('closePrice')) quote.open = _parse_float(regular_quote.get('openPrice')) quote.bid = _parse_float(regular_quote.get('bidPrice')) quote.ask = _parse_float(regular_quote.get('askPrice')) quote.volume = _parse_volume(regular_quote.get('totalVolume')) quote.day_range_low = _parse_float(regular_quote.get('lowPrice')) quote.day_range_high = _parse_float(regular_quote.get('highPrice')) quote.week_52_low = _parse_float(regular_quote.get('priceLow52W')) quote.week_52_high = _parse_float(regular_quote.get('priceHigh52W')) # Bid/Ask size bid_size = regular_quote.get('bidSize', 0) ask_size = regular_quote.get('askSize', 0) if bid_size or ask_size: quote.bid_ask_size = f"{bid_size}/{ask_size}" # Volume vs average avg_volume_label = regular_quote.get('averageVolumeDaily') if avg_volume_label: quote.volume_vs_avg = avg_volume_label except Exception as e: logger.debug(f"Error parsing quote API response: {e}") return quote def parse_dividends_api_response(data: Dict[str, Any]) -> EnhancedDividends: """Parse dividends API response into EnhancedDividends object. API Response Structure: { "symbol": "JNJ", "currentAnnualDividendMethod": "IAD", "status": "DIVIDENDS_PAID_CURRENTLY", "dividends": [ { "dividendPayment": 1.3, "dividendPayDate": "December 09, 2025", "dividendExDate": "November 25, 2025", "dividendFrequency": "Quarterly", "annualDividendRate": 5.2, "dividendYield": "2.71%" }, ... ] } """ dividends = EnhancedDividends() try: dividend_list = data.get('dividends', []) if not dividend_list: return dividends # Most recent dividend is first latest = dividend_list[0] # Next/upcoming dividend data dividends.next_payment = _parse_float(latest.get('dividendPayment')) dividends.next_pay_date = latest.get('dividendPayDate') dividends.next_ex_date = latest.get('dividendExDate') dividends.frequency = latest.get('dividendFrequency') dividends.annual_rate = _parse_float(latest.get('annualDividendRate')) dividends.annual_yield = _parse_float(latest.get('dividendYield')) # Previous dividend (if there's more than one in history) if len(dividend_list) > 1: previous = dividend_list[1] dividends.previous_payment = _parse_float(previous.get('dividendPayment')) dividends.previous_pay_date = previous.get('dividendPayDate') dividends.previous_ex_date = previous.get('dividendExDate') except Exception as e: logger.debug(f"Error parsing dividends API response: {e}") return dividends def parse_earnings_api_response(data: Dict[str, Any]) -> EarningsData: """Parse earnings API response into EarningsData object. API Response Structure: { "symbol": "GOOGL", "fundamentals": {}, "upcoming": { "earningsDate": "10/29/2025", "numberOfAnalysts": 43, "epsNonGaapEstimate": 2.18 }, "historical": [ { "epsGaapActual": 2.31, "epsNonGaapActual": 2.31, "earningsDate": "07/23/2025", "numberOfAnalysts": 43, "epsNonGaapEstimate": 2.18, "epsNonGaapEstimateHigh": 2.42, "epsNonGaapEstimateLow": 2.0 } ] } """ earnings = EarningsData() try: upcoming = data.get('upcoming', {}) historical = data.get('historical', []) fundamentals = data.get('fundamentals', {}) # Upcoming earnings if upcoming: earnings.next_announcement_date = upcoming.get('earningsDate') earnings.announcement_timing = upcoming.get('announcementTiming') earnings.analysts_covering = upcoming.get('numberOfAnalysts') earnings.consensus_estimate = _parse_float(upcoming.get('epsNonGaapEstimate')) earnings.estimate_high = _parse_float(upcoming.get('epsNonGaapEstimateHigh')) earnings.estimate_low = _parse_float(upcoming.get('epsNonGaapEstimateLow')) # Historical earnings (most recent) if historical: latest = historical[0] earnings.eps_ttm = _parse_float(latest.get('epsNonGaapActual') or latest.get('epsGaapActual')) # If we don't have upcoming, use latest historical for analyst data if not upcoming: earnings.analysts_covering = latest.get('numberOfAnalysts') earnings.consensus_estimate = _parse_float(latest.get('epsNonGaapEstimate')) earnings.estimate_high = _parse_float(latest.get('epsNonGaapEstimateHigh')) earnings.estimate_low = _parse_float(latest.get('epsNonGaapEstimateLow')) # Beat/miss information beat_amount = latest.get('epsNonGaapBeat') if beat_amount is not None: earnings.recent_beats = [{ 'beat_amount': _parse_float(beat_amount), 'beat_percent': _parse_float(latest.get('epsNonGaapBeatPercent')), 'date': latest.get('earningsDate') }] # Fundamentals (PE ratios, revenue) if fundamentals: earnings.pe_ttm = _parse_float(fundamentals.get('peRatio')) earnings.forward_pe = _parse_float(fundamentals.get('forwardPE')) earnings.peg_ratio = _parse_float(fundamentals.get('pegRatio')) earnings.revenue_ttm = _parse_float(fundamentals.get('revenue')) except Exception as e: logger.debug(f"Error parsing earnings API response: {e}") return earnings def parse_shareprofile_api_response(data: Dict[str, Any], quote: QuoteData) -> QuoteData: """Parse share profile API response and enhance QuoteData with market cap, etc. API Response Structure: { "companySummary": { "marketCapLabel": "Large Cap", "marketCapValue": "$462.11B", "companyEnterpriseValue": "$462.11B" }, "shareInfo": [{ "sharesOutstanding": "2.41B", "sharesHeld": "71.29%" }] } """ try: company_summary = data.get('companySummary', {}) # Market cap quote.market_cap = _parse_market_cap(company_summary.get('marketCapValue')) # Sector info might be in other fields # Note: Sector information may not be in shareprofile API # It might be in securityprofiles or other endpoints except Exception as e: logger.debug(f"Error parsing share profile API response: {e}") return quote def calculate_payout_ratio(annual_dividend: Optional[float], eps_ttm: Optional[float]) -> Optional[float]: """Calculate dividend payout ratio. Formula: (Annual Dividend Rate / EPS TTM) × 100 """ if annual_dividend and eps_ttm and eps_ttm > 0: ratio = (annual_dividend / eps_ttm) * 100 return round(ratio, 2) return None async def call_schwab_api(page: Page, url: str, debug: bool = False) -> Optional[Dict[str, Any]]: """Call a Schwab API endpoint from within the browser's JavaScript context. This uses page.evaluate() to run fetch() directly in the browser, which ensures all cookies, authentication tokens, and session state are automatically included. This is the most reliable way to call Schwab APIs. Args: page: Playwright page with authenticated session url: API endpoint URL debug: Enable debug logging Returns: Parsed JSON response or None on error """ try: if debug: logger.debug(f"Calling API: {url}") # Generate correlation IDs correlator_id = str(uuid.uuid4()) client_correlid = str(uuid.uuid4()) # Call API from within browser's JavaScript context using fetch() # This automatically includes all cookies and session state result = await page.evaluate(""" async ({url, correlatorId, clientCorrelId}) => { try { const response = await fetch(url, { method: 'GET', credentials: 'include', // Include cookies headers: { 'accept': 'application/json', 'accept-language': 'en-US,en;q=0.9', 'cache-control': 'no-cache', 'content-type': 'application/json', 'correlatorid': correlatorId, 'pragma': 'no-cache', 'schwab-client-appid': 'AD00007800', 'schwab-client-channel': 'IO', 'schwab-client-correlid': clientCorrelId, 'schwab-resource-version': '2', } }); if (!response.ok) { const errorText = await response.text(); return { success: false, status: response.status, error: errorText }; } const data = await response.json(); return { success: true, status: response.status, data: data }; } catch (error) { return { success: false, error: error.toString() }; } } """, {'url': url, 'correlatorId': correlator_id, 'clientCorrelId': client_correlid}) if not result.get('success'): if debug: status = result.get('status', 'unknown') error = result.get('error', 'unknown error') logger.debug(f"API returned status {status}: {str(error)[:200]}") return None data = result.get('data') if debug and data: logger.debug(f"API response keys: {list(data.keys()) if isinstance(data, dict) else 'list'}") return data except Exception as e: if debug: logger.debug(f"Error calling API {url}: {e}") return None async def extract_phase1_data_api(page: Page, ticker: str, debug: bool = False) -> EquityPhase1Data: """Extract Phase 1 data using Schwab's REST APIs. This is the API-based replacement for the DOM scraping approach. It calls Schwab's APIs directly using the authenticated session. Args: page: Playwright page with authenticated session ticker: Stock ticker symbol debug: Enable debug logging Returns: EquityPhase1Data with all extracted fields """ if debug: logger.debug(f"Starting API-based Phase 1 extraction for {ticker}") base_url = "https://ausgateway.schwab.com/api/is.ResearchExperience/v1" # Build API URLs quote_url = f"{base_url}/quote?symbols={ticker}&isComplex=true" dividends_url = f"{base_url}/events/dividends?symbol={ticker}" earnings_url = f"{base_url}/events/earnings?symbols={ticker}" profile_url = f"{base_url}/shareprofile?symbols={ticker}&includeSubsidiaries=true" # Make API calls using Playwright's request context (includes cookies automatically) quote_data = await call_schwab_api(page, quote_url, debug) dividends_data = await call_schwab_api(page, dividends_url, debug) earnings_data = await call_schwab_api(page, earnings_url, debug) profile_data = await call_schwab_api(page, profile_url, debug) # Parse responses # Quote API returns a list, get first item if quote_data and isinstance(quote_data, list) and len(quote_data) > 0: quote = parse_quote_api_response(quote_data[0]) elif quote_data and isinstance(quote_data, dict): quote = parse_quote_api_response(quote_data) else: quote = QuoteData() # Enhance quote with share profile data if profile_data: quote = parse_shareprofile_api_response(profile_data, quote) # Parse dividends dividends = parse_dividends_api_response(dividends_data) if dividends_data else EnhancedDividends() # Parse earnings earnings = parse_earnings_api_response(earnings_data) if earnings_data else EarningsData() # Calculate derived metrics calculated = CalculatedMetrics() if dividends.annual_rate and earnings.eps_ttm: calculated.payout_ratio = calculate_payout_ratio( dividends.annual_rate, earnings.eps_ttm ) # Create Phase 1 data object phase1_data = EquityPhase1Data( ticker=ticker, quote=quote, dividends=dividends, earnings=earnings, calculated_metrics=calculated ) if debug: logger.debug(f"API-based Phase 1 extraction complete for {ticker}") # Count populated fields (dataclasses with slots don't have __dict__) from dataclasses import fields as dataclass_fields quote_count = sum(1 for f in dataclass_fields(quote) if getattr(quote, f.name) is not None) div_count = sum(1 for f in dataclass_fields(dividends) if getattr(dividends, f.name) is not None) earn_count = sum(1 for f in dataclass_fields(earnings) if getattr(earnings, f.name) not in (None, [])) logger.debug(f" Quote fields populated: {quote_count}/21") logger.debug(f" Dividend fields populated: {div_count}/9") logger.debug(f" Earnings fields populated: {earn_count}/13") return phase1_data