From 27d1e2be10ecaabe8cb75739960750b79c51110a Mon Sep 17 00:00:00 2001 From: b3nw Date: Thu, 21 May 2026 14:46:08 +0000 Subject: [PATCH] fix: resolve report_url=None for blob-URL tickers and fix dataclass serialization When Schwab uses modern blob URLs (increasingly common), find_report() returns __CLICK_TO_OPEN__ and the scraper skips storing a report_url even though the PDF downloads and parses successfully. This caused agents to see report_url=None for tickers like PEP/BR/DPZ/MSCI/BMI. Changes: - Fix serialize() to use dataclasses.asdict() instead of str() for dataclass payloads, producing proper JSON objects instead of Python repr strings - Add /reports/{ticker}/pdf endpoint to serve cached Morningstar PDFs - Enrich report_url with the MCP's own PDF endpoint when blob URLs were used and the report was successfully downloaded - Add SCHWAB_MCP_BASE_URL env var to compose for self-referential URLs --- compose.yaml | 1 + server.py | 49 +++++++++++++++++++++++++++++++++++++++++++++---- 2 files changed, 46 insertions(+), 4 deletions(-) diff --git a/compose.yaml b/compose.yaml index 7d5d3ac..553b882 100644 --- a/compose.yaml +++ b/compose.yaml @@ -22,6 +22,7 @@ services: cpus: '0.1' environment: - SCHWAB_PLAYWRIGHT_URL=ws://browser.local.ben.io:3000/playwright/chromium?timeout=300000 + - SCHWAB_MCP_BASE_URL=https://schwab-mcp.ext.ben.io - PORT=8000 volumes: - ./cookies.json:/app/cookies.json diff --git a/server.py b/server.py index 0b738b3..39d354c 100644 --- a/server.py +++ b/server.py @@ -1,3 +1,4 @@ +import dataclasses import io import json import logging @@ -9,11 +10,12 @@ from typing import Optional, Any, Tuple from fastmcp import FastMCP from starlette.applications import Starlette -from starlette.responses import JSONResponse +from starlette.responses import JSONResponse, Response from starlette.routing import Route, Mount import uvicorn import schwab_scraper.unified_api as api +from schwab_scraper.storage.cache import read_cached_pdf # --------------------------------------------------------------------------- @@ -236,18 +238,25 @@ login_manager = LoginManager() mcp = FastMCP("SchwabScraper") +def _json_default(obj: Any) -> Any: + """JSON fallback handler that converts dataclasses to dicts before str().""" + if dataclasses.is_dataclass(obj) and not isinstance(obj, type): + return dataclasses.asdict(obj) + return str(obj) + + def serialize(obj: Any) -> str: """Safely serialize Pydantic models or dataclasses to JSON string.""" if hasattr(obj, "model_dump_json"): return obj.model_dump_json() elif hasattr(obj, "model_dump"): - return json.dumps(obj.model_dump(), default=str) + return json.dumps(obj.model_dump(), default=_json_default) elif isinstance(obj, list): return json.dumps([ o.model_dump() if hasattr(o, "model_dump") else o for o in obj - ], default=str) - return json.dumps(obj, default=str) + ], default=_json_default) + return json.dumps(obj, default=_json_default) # --------------------------------------------------------------------------- @@ -457,6 +466,21 @@ async def get_morningstar_data(ticker: str, debug: bool = False) -> str: debug: Enable debug logging """ result = await api.get_morningstar_data(ticker, debug=debug) + + # When the scraper used blob URLs (modern Schwab web components), report_url + # is None even though the PDF was downloaded and parsed successfully. Point + # callers at the MCP server's cached-PDF endpoint instead. + if ( + isinstance(result, dict) + and result.get("success") + and result.get("data") is not None + ): + data = result["data"] + if hasattr(data, "report_url") and data.report_url is None and data.source is not None: + base = os.getenv("SCHWAB_MCP_BASE_URL", "").rstrip("/") + if base: + data.report_url = f"{base}/reports/{ticker.upper()}/pdf" + return serialize(result) @@ -527,10 +551,27 @@ async def health(request): return JSONResponse({"status": "ok"}) +async def serve_report_pdf(request): + """Serve a cached Morningstar report PDF by ticker.""" + ticker = request.path_params["ticker"].upper() + pdf_bytes = read_cached_pdf(ticker) + if not pdf_bytes: + return JSONResponse( + {"error": f"No cached report for {ticker}. Call get_morningstar_data first."}, + status_code=404, + ) + return Response( + pdf_bytes, + media_type="application/pdf", + headers={"Content-Disposition": f'inline; filename="{ticker}_morningstar.pdf"'}, + ) + + mcp_app = mcp.http_app() app = Starlette( routes=[ Route("/health", health), + Route("/reports/{ticker}/pdf", serve_report_pdf), Mount("/", app=mcp_app), ], lifespan=mcp_app.lifespan,