Fix build: Bundle schwab_scraper source and use local dependencies

2026-04-24 01:50:20 +00:00
parent 02ac293692
commit 650ea2d087
43 changed files with 10900 additions and 41 deletions
--- a/schwab_scraper/features/equity/phase1_api_scraper.py
+++ b/schwab_scraper/features/equity/phase1_api_scraper.py
@@ -0,0 +1,490 @@
+"""Phase 1: API-Based Data Extraction (EXPERIMENTAL - NON-FUNCTIONAL)
+
+⚠️ **STATUS: NON-FUNCTIONAL DUE TO CORS RESTRICTIONS** ⚠️
+
+This module was an attempt to extract equity data by calling Schwab's REST APIs directly.
+While the APIs exist and were discovered via HAR analysis, they are NOT accessible from 
+this scraper due to fundamental browser security limitations (CORS).
+
+## Why This Approach Failed:
+
+1. **CORS (Cross-Origin Resource Sharing) Restrictions**: 
+   - Research page: `client.schwab.com`, APIs: `ausgateway.schwab.com` (different origins)
+   - Browser blocks cross-origin fetch() calls even from page.evaluate()
+   - Results in "TypeError: Failed to fetch"
+
+2. **Authentication Complexity**:
+   - Direct HTTP (aiohttp) with cookies: 401/403 errors
+   - Playwright page.request.fetch(): 401 errors (separate context)
+   - Likely requires dynamic tokens beyond cookies
+
+## Recommendation:
+
+**Use `phase1_scraper.py` (DOM scraping) instead**. It works reliably with authenticated
+sessions and extracts all Phase 1 fields without CORS limitations.
+
+## API Endpoints (discovered but inaccessible):
+- Quote: /api/is.ResearchExperience/v1/quote
+- Dividends: /api/is.ResearchExperience/v1/events/dividends
+- Earnings: /api/is.ResearchExperience/v1/events/earnings
+- Share Profile: /api/is.ResearchExperience/v1/shareprofile
+"""
+
+from typing import Dict, Any, Optional, List
+import logging
+import uuid
+import aiohttp
+from playwright.async_api import Page
+
+from ...core import (
+    QuoteData, EnhancedDividends, EarningsData, 
+    CalculatedMetrics, EquityPhase1Data
+)
+
+logger = logging.getLogger(__name__)
+
+
+def _parse_float(value: Any) -> Optional[float]:
+    """Safely parse a value to float."""
+    if value is None:
+        return None
+    try:
+        if isinstance(value, str):
+            # Remove % sign if present
+            value = value.replace('%', '').strip()
+        return float(value)
+    except (ValueError, TypeError):
+        return None
+
+
+def _parse_market_cap(value: str) -> Optional[str]:
+    """Parse market cap string like '$3.03T' or '$462.11B'."""
+    if not value:
+        return None
+    # Keep the formatted string as-is for readability
+    return value.strip()
+
+
+def _parse_volume(value: Any) -> Optional[int]:
+    """Parse volume value."""
+    if value is None:
+        return None
+    try:
+        return int(float(value))
+    except (ValueError, TypeError):
+        return None
+
+
+def parse_quote_api_response(data: Dict[str, Any]) -> QuoteData:
+    """Parse quote API response into QuoteData object.
+    
+    API Response Structure:
+    {
+      "reference": {
+        "symbol": "JNJ",
+        "companyName": "JOHNSON & JOHNSON",
+        "exchangeName": "NYSE"
+      },
+      "quote": {
+        "lastPrice": 193.155,
+        "netChange": 1.275,
+        "netChangePercent": 0.6644778,
+        "postMarketChange": 0.0,
+        "postMarketPercentChange": 0.0,
+        "tradeTime": "2025-10-22T17:06:42.008Z"
+      },
+      "regularQuote": {
+        "lastPrice": 193.155,
+        "lastSize": 100.0,
+        "netChange": 1.275,
+        "percentChange": 0.6644778,
+        ...
+      }
+    }
+    """
+    quote = QuoteData()
+    
+    try:
+        reference = data.get('reference', {})
+        quote_data = data.get('quote', {})
+        regular_quote = data.get('regularQuote', {})
+        
+        # Basic info
+        quote.exchange = reference.get('exchangeName')
+        
+        # Price data
+        quote.price = _parse_float(quote_data.get('lastPrice'))
+        quote.change = _parse_float(quote_data.get('netChange'))
+        quote.change_percent = _parse_float(quote_data.get('netChangePercent'))
+        
+        # After hours (post market)
+        quote.after_hours_change = _parse_float(quote_data.get('postMarketChange'))
+        quote.after_hours_change_percent = _parse_float(quote_data.get('postMarketPercentChange'))
+        
+        # Extended quote data
+        quote.previous_close = _parse_float(regular_quote.get('closePrice'))
+        quote.open = _parse_float(regular_quote.get('openPrice'))
+        quote.bid = _parse_float(regular_quote.get('bidPrice'))
+        quote.ask = _parse_float(regular_quote.get('askPrice'))
+        quote.volume = _parse_volume(regular_quote.get('totalVolume'))
+        quote.day_range_low = _parse_float(regular_quote.get('lowPrice'))
+        quote.day_range_high = _parse_float(regular_quote.get('highPrice'))
+        quote.week_52_low = _parse_float(regular_quote.get('priceLow52W'))
+        quote.week_52_high = _parse_float(regular_quote.get('priceHigh52W'))
+        
+        # Bid/Ask size
+        bid_size = regular_quote.get('bidSize', 0)
+        ask_size = regular_quote.get('askSize', 0)
+        if bid_size or ask_size:
+            quote.bid_ask_size = f"{bid_size}/{ask_size}"
+        
+        # Volume vs average
+        avg_volume_label = regular_quote.get('averageVolumeDaily')
+        if avg_volume_label:
+            quote.volume_vs_avg = avg_volume_label
+            
+    except Exception as e:
+        logger.debug(f"Error parsing quote API response: {e}")
+    
+    return quote
+
+
+def parse_dividends_api_response(data: Dict[str, Any]) -> EnhancedDividends:
+    """Parse dividends API response into EnhancedDividends object.
+    
+    API Response Structure:
+    {
+      "symbol": "JNJ",
+      "currentAnnualDividendMethod": "IAD",
+      "status": "DIVIDENDS_PAID_CURRENTLY",
+      "dividends": [
+        {
+          "dividendPayment": 1.3,
+          "dividendPayDate": "December 09, 2025",
+          "dividendExDate": "November 25, 2025",
+          "dividendFrequency": "Quarterly",
+          "annualDividendRate": 5.2,
+          "dividendYield": "2.71%"
+        },
+        ...
+      ]
+    }
+    """
+    dividends = EnhancedDividends()
+    
+    try:
+        dividend_list = data.get('dividends', [])
+        if not dividend_list:
+            return dividends
+        
+        # Most recent dividend is first
+        latest = dividend_list[0]
+        
+        # Next/upcoming dividend data
+        dividends.next_payment = _parse_float(latest.get('dividendPayment'))
+        dividends.next_pay_date = latest.get('dividendPayDate')
+        dividends.next_ex_date = latest.get('dividendExDate')
+        dividends.frequency = latest.get('dividendFrequency')
+        dividends.annual_rate = _parse_float(latest.get('annualDividendRate'))
+        dividends.annual_yield = _parse_float(latest.get('dividendYield'))
+        
+        # Previous dividend (if there's more than one in history)
+        if len(dividend_list) > 1:
+            previous = dividend_list[1]
+            dividends.previous_payment = _parse_float(previous.get('dividendPayment'))
+            dividends.previous_pay_date = previous.get('dividendPayDate')
+            dividends.previous_ex_date = previous.get('dividendExDate')
+            
+    except Exception as e:
+        logger.debug(f"Error parsing dividends API response: {e}")
+    
+    return dividends
+
+
+def parse_earnings_api_response(data: Dict[str, Any]) -> EarningsData:
+    """Parse earnings API response into EarningsData object.
+    
+    API Response Structure:
+    {
+      "symbol": "GOOGL",
+      "fundamentals": {},
+      "upcoming": {
+        "earningsDate": "10/29/2025",
+        "numberOfAnalysts": 43,
+        "epsNonGaapEstimate": 2.18
+      },
+      "historical": [
+        {
+          "epsGaapActual": 2.31,
+          "epsNonGaapActual": 2.31,
+          "earningsDate": "07/23/2025",
+          "numberOfAnalysts": 43,
+          "epsNonGaapEstimate": 2.18,
+          "epsNonGaapEstimateHigh": 2.42,
+          "epsNonGaapEstimateLow": 2.0
+        }
+      ]
+    }
+    """
+    earnings = EarningsData()
+    
+    try:
+        upcoming = data.get('upcoming', {})
+        historical = data.get('historical', [])
+        fundamentals = data.get('fundamentals', {})
+        
+        # Upcoming earnings
+        if upcoming:
+            earnings.next_announcement_date = upcoming.get('earningsDate')
+            earnings.announcement_timing = upcoming.get('announcementTiming')
+            earnings.analysts_covering = upcoming.get('numberOfAnalysts')
+            earnings.consensus_estimate = _parse_float(upcoming.get('epsNonGaapEstimate'))
+            earnings.estimate_high = _parse_float(upcoming.get('epsNonGaapEstimateHigh'))
+            earnings.estimate_low = _parse_float(upcoming.get('epsNonGaapEstimateLow'))
+        
+        # Historical earnings (most recent)
+        if historical:
+            latest = historical[0]
+            earnings.eps_ttm = _parse_float(latest.get('epsNonGaapActual') or latest.get('epsGaapActual'))
+            
+            # If we don't have upcoming, use latest historical for analyst data
+            if not upcoming:
+                earnings.analysts_covering = latest.get('numberOfAnalysts')
+                earnings.consensus_estimate = _parse_float(latest.get('epsNonGaapEstimate'))
+                earnings.estimate_high = _parse_float(latest.get('epsNonGaapEstimateHigh'))
+                earnings.estimate_low = _parse_float(latest.get('epsNonGaapEstimateLow'))
+            
+            # Beat/miss information
+            beat_amount = latest.get('epsNonGaapBeat')
+            if beat_amount is not None:
+                earnings.recent_beats = [{
+                    'beat_amount': _parse_float(beat_amount),
+                    'beat_percent': _parse_float(latest.get('epsNonGaapBeatPercent')),
+                    'date': latest.get('earningsDate')
+                }]
+        
+        # Fundamentals (PE ratios, revenue)
+        if fundamentals:
+            earnings.pe_ttm = _parse_float(fundamentals.get('peRatio'))
+            earnings.forward_pe = _parse_float(fundamentals.get('forwardPE'))
+            earnings.peg_ratio = _parse_float(fundamentals.get('pegRatio'))
+            earnings.revenue_ttm = _parse_float(fundamentals.get('revenue'))
+            
+    except Exception as e:
+        logger.debug(f"Error parsing earnings API response: {e}")
+    
+    return earnings
+
+
+def parse_shareprofile_api_response(data: Dict[str, Any], quote: QuoteData) -> QuoteData:
+    """Parse share profile API response and enhance QuoteData with market cap, etc.
+    
+    API Response Structure:
+    {
+      "companySummary": {
+        "marketCapLabel": "Large Cap",
+        "marketCapValue": "$462.11B",
+        "companyEnterpriseValue": "$462.11B"
+      },
+      "shareInfo": [{
+        "sharesOutstanding": "2.41B",
+        "sharesHeld": "71.29%"
+      }]
+    }
+    """
+    try:
+        company_summary = data.get('companySummary', {})
+        
+        # Market cap
+        quote.market_cap = _parse_market_cap(company_summary.get('marketCapValue'))
+        
+        # Sector info might be in other fields
+        # Note: Sector information may not be in shareprofile API
+        # It might be in securityprofiles or other endpoints
+        
+    except Exception as e:
+        logger.debug(f"Error parsing share profile API response: {e}")
+    
+    return quote
+
+
+def calculate_payout_ratio(annual_dividend: Optional[float], eps_ttm: Optional[float]) -> Optional[float]:
+    """Calculate dividend payout ratio.
+    
+    Formula: (Annual Dividend Rate / EPS TTM) × 100
+    """
+    if annual_dividend and eps_ttm and eps_ttm > 0:
+        ratio = (annual_dividend / eps_ttm) * 100
+        return round(ratio, 2)
+    return None
+
+
+async def call_schwab_api(page: Page, url: str, debug: bool = False) -> Optional[Dict[str, Any]]:
+    """Call a Schwab API endpoint from within the browser's JavaScript context.
+    
+    This uses page.evaluate() to run fetch() directly in the browser, which ensures
+    all cookies, authentication tokens, and session state are automatically included.
+    This is the most reliable way to call Schwab APIs.
+    
+    Args:
+        page: Playwright page with authenticated session
+        url: API endpoint URL
+        debug: Enable debug logging
+        
+    Returns:
+        Parsed JSON response or None on error
+    """
+    try:
+        if debug:
+            logger.debug(f"Calling API: {url}")
+        
+        # Generate correlation IDs
+        correlator_id = str(uuid.uuid4())
+        client_correlid = str(uuid.uuid4())
+        
+        # Call API from within browser's JavaScript context using fetch()
+        # This automatically includes all cookies and session state
+        result = await page.evaluate("""
+            async ({url, correlatorId, clientCorrelId}) => {
+                try {
+                    const response = await fetch(url, {
+                        method: 'GET',
+                        credentials: 'include',  // Include cookies
+                        headers: {
+                            'accept': 'application/json',
+                            'accept-language': 'en-US,en;q=0.9',
+                            'cache-control': 'no-cache',
+                            'content-type': 'application/json',
+                            'correlatorid': correlatorId,
+                            'pragma': 'no-cache',
+                            'schwab-client-appid': 'AD00007800',
+                            'schwab-client-channel': 'IO',
+                            'schwab-client-correlid': clientCorrelId,
+                            'schwab-resource-version': '2',
+                        }
+                    });
+                    
+                    if (!response.ok) {
+                        const errorText = await response.text();
+                        return {
+                            success: false,
+                            status: response.status,
+                            error: errorText
+                        };
+                    }
+                    
+                    const data = await response.json();
+                    return {
+                        success: true,
+                        status: response.status,
+                        data: data
+                    };
+                } catch (error) {
+                    return {
+                        success: false,
+                        error: error.toString()
+                    };
+                }
+            }
+        """, {'url': url, 'correlatorId': correlator_id, 'clientCorrelId': client_correlid})
+        
+        if not result.get('success'):
+            if debug:
+                status = result.get('status', 'unknown')
+                error = result.get('error', 'unknown error')
+                logger.debug(f"API returned status {status}: {str(error)[:200]}")
+            return None
+        
+        data = result.get('data')
+        
+        if debug and data:
+            logger.debug(f"API response keys: {list(data.keys()) if isinstance(data, dict) else 'list'}")
+        
+        return data
+        
+    except Exception as e:
+        if debug:
+            logger.debug(f"Error calling API {url}: {e}")
+        return None
+
+
+async def extract_phase1_data_api(page: Page, ticker: str, debug: bool = False) -> EquityPhase1Data:
+    """Extract Phase 1 data using Schwab's REST APIs.
+    
+    This is the API-based replacement for the DOM scraping approach.
+    It calls Schwab's APIs directly using the authenticated session.
+    
+    Args:
+        page: Playwright page with authenticated session
+        ticker: Stock ticker symbol
+        debug: Enable debug logging
+        
+    Returns:
+        EquityPhase1Data with all extracted fields
+    """
+    if debug:
+        logger.debug(f"Starting API-based Phase 1 extraction for {ticker}")
+    
+    base_url = "https://ausgateway.schwab.com/api/is.ResearchExperience/v1"
+    
+    # Build API URLs
+    quote_url = f"{base_url}/quote?symbols={ticker}&isComplex=true"
+    dividends_url = f"{base_url}/events/dividends?symbol={ticker}"
+    earnings_url = f"{base_url}/events/earnings?symbols={ticker}"
+    profile_url = f"{base_url}/shareprofile?symbols={ticker}&includeSubsidiaries=true"
+    
+    # Make API calls using Playwright's request context (includes cookies automatically)
+    quote_data = await call_schwab_api(page, quote_url, debug)
+    dividends_data = await call_schwab_api(page, dividends_url, debug)
+    earnings_data = await call_schwab_api(page, earnings_url, debug)
+    profile_data = await call_schwab_api(page, profile_url, debug)
+    
+    # Parse responses
+    # Quote API returns a list, get first item
+    if quote_data and isinstance(quote_data, list) and len(quote_data) > 0:
+        quote = parse_quote_api_response(quote_data[0])
+    elif quote_data and isinstance(quote_data, dict):
+        quote = parse_quote_api_response(quote_data)
+    else:
+        quote = QuoteData()
+    
+    # Enhance quote with share profile data
+    if profile_data:
+        quote = parse_shareprofile_api_response(profile_data, quote)
+    
+    # Parse dividends
+    dividends = parse_dividends_api_response(dividends_data) if dividends_data else EnhancedDividends()
+    
+    # Parse earnings
+    earnings = parse_earnings_api_response(earnings_data) if earnings_data else EarningsData()
+    
+    # Calculate derived metrics
+    calculated = CalculatedMetrics()
+    if dividends.annual_rate and earnings.eps_ttm:
+        calculated.payout_ratio = calculate_payout_ratio(
+            dividends.annual_rate,
+            earnings.eps_ttm
+        )
+    
+    # Create Phase 1 data object
+    phase1_data = EquityPhase1Data(
+        ticker=ticker,
+        quote=quote,
+        dividends=dividends,
+        earnings=earnings,
+        calculated_metrics=calculated
+    )
+    
+    if debug:
+        logger.debug(f"API-based Phase 1 extraction complete for {ticker}")
+        # Count populated fields (dataclasses with slots don't have __dict__)
+        from dataclasses import fields as dataclass_fields
+        quote_count = sum(1 for f in dataclass_fields(quote) if getattr(quote, f.name) is not None)
+        div_count = sum(1 for f in dataclass_fields(dividends) if getattr(dividends, f.name) is not None)
+        earn_count = sum(1 for f in dataclass_fields(earnings) if getattr(earnings, f.name) not in (None, []))
+        logger.debug(f"  Quote fields populated: {quote_count}/21")
+        logger.debug(f"  Dividend fields populated: {div_count}/9")
+        logger.debug(f"  Earnings fields populated: {earn_count}/13")
+    
+    return phase1_data
+