fix: resolve report_url=None for blob-URL tickers and fix dataclass serialization
All checks were successful
Build and Push Docker Image / build (push) Successful in 1m4s

When Schwab uses modern blob URLs (increasingly common), find_report()
returns __CLICK_TO_OPEN__ and the scraper skips storing a report_url
even though the PDF downloads and parses successfully. This caused
agents to see report_url=None for tickers like PEP/BR/DPZ/MSCI/BMI.

Changes:
- Fix serialize() to use dataclasses.asdict() instead of str() for
  dataclass payloads, producing proper JSON objects instead of Python
  repr strings
- Add /reports/{ticker}/pdf endpoint to serve cached Morningstar PDFs
- Enrich report_url with the MCP's own PDF endpoint when blob URLs
  were used and the report was successfully downloaded
- Add SCHWAB_MCP_BASE_URL env var to compose for self-referential URLs
This commit is contained in:
2026-05-21 14:46:08 +00:00
parent 0e048a1e08
commit 27d1e2be10
2 changed files with 46 additions and 4 deletions

View File

@@ -22,6 +22,7 @@ services:
cpus: '0.1'
environment:
- SCHWAB_PLAYWRIGHT_URL=ws://browser.local.ben.io:3000/playwright/chromium?timeout=300000
- SCHWAB_MCP_BASE_URL=https://schwab-mcp.ext.ben.io
- PORT=8000
volumes:
- ./cookies.json:/app/cookies.json

View File

@@ -1,3 +1,4 @@
import dataclasses
import io
import json
import logging
@@ -9,11 +10,12 @@ from typing import Optional, Any, Tuple
from fastmcp import FastMCP
from starlette.applications import Starlette
from starlette.responses import JSONResponse
from starlette.responses import JSONResponse, Response
from starlette.routing import Route, Mount
import uvicorn
import schwab_scraper.unified_api as api
from schwab_scraper.storage.cache import read_cached_pdf
# ---------------------------------------------------------------------------
@@ -236,18 +238,25 @@ login_manager = LoginManager()
mcp = FastMCP("SchwabScraper")
def _json_default(obj: Any) -> Any:
"""JSON fallback handler that converts dataclasses to dicts before str()."""
if dataclasses.is_dataclass(obj) and not isinstance(obj, type):
return dataclasses.asdict(obj)
return str(obj)
def serialize(obj: Any) -> str:
    """Serialize a tool result to a JSON string.

    Args:
        obj: The value to serialize. Objects exposing ``model_dump_json`` or
            ``model_dump``, lists of such objects, and plain JSON-compatible
            values are handled; anything ``json.dumps`` cannot encode
            natively is passed to ``_json_default``.

    Returns:
        The JSON text for ``obj``.
    """
    # Prefer the object's own JSON encoder when it has one.
    if hasattr(obj, "model_dump_json"):
        return obj.model_dump_json()
    if hasattr(obj, "model_dump"):
        return json.dumps(obj.model_dump(), default=_json_default)
    if isinstance(obj, list):
        # Dump model-like elements up front; other elements are left for
        # json.dumps and the fallback hook.
        return json.dumps(
            [o.model_dump() if hasattr(o, "model_dump") else o for o in obj],
            default=_json_default,
        )
    return json.dumps(obj, default=_json_default)
# ---------------------------------------------------------------------------
@@ -457,6 +466,21 @@ async def get_morningstar_data(ticker: str, debug: bool = False) -> str:
debug: Enable debug logging
"""
result = await api.get_morningstar_data(ticker, debug=debug)
# When the scraper used blob URLs (modern Schwab web components), report_url
# is None even though the PDF was downloaded and parsed successfully. Point
# callers at the MCP server's cached-PDF endpoint instead.
if (
isinstance(result, dict)
and result.get("success")
and result.get("data") is not None
):
data = result["data"]
if hasattr(data, "report_url") and data.report_url is None and data.source is not None:
base = os.getenv("SCHWAB_MCP_BASE_URL", "").rstrip("/")
if base:
data.report_url = f"{base}/reports/{ticker.upper()}/pdf"
return serialize(result)
@@ -527,10 +551,27 @@ async def health(request):
return JSONResponse({"status": "ok"})
async def serve_report_pdf(request):
    """Serve a cached Morningstar report PDF by ticker.

    Args:
        request: Incoming request; the ``ticker`` path parameter selects the
            report and is upper-cased before use.

    Returns:
        A 400 JSON error when the ticker is not a plausible symbol, a 404
        JSON error when ``read_cached_pdf`` returns nothing for the ticker,
        otherwise the PDF bytes as an inline ``application/pdf`` response.
    """
    ticker = request.path_params["ticker"].upper()
    # The ticker comes straight from the URL, is handed to the cache lookup,
    # and is echoed inside a quoted header value. Accept only short ASCII
    # symbols (letters, digits, '.', '-') that start with a letter or digit,
    # so values such as '..' or ones containing quotes never reach either.
    is_valid_ticker = (
        0 < len(ticker) <= 12
        and all(ch.isascii() and (ch.isalnum() or ch in ".-") for ch in ticker)
        and ticker[0].isalnum()
    )
    if not is_valid_ticker:
        return JSONResponse({"error": "Invalid ticker symbol."}, status_code=400)
    pdf_bytes = read_cached_pdf(ticker)
    if not pdf_bytes:
        return JSONResponse(
            {"error": f"No cached report for {ticker}. Call get_morningstar_data first."},
            status_code=404,
        )
    return Response(
        pdf_bytes,
        media_type="application/pdf",
        headers={"Content-Disposition": f'inline; filename="{ticker}_morningstar.pdf"'},
    )
mcp_app = mcp.http_app()
app = Starlette(
routes=[
Route("/health", health),
Route("/reports/{ticker}/pdf", serve_report_pdf),
Mount("/", app=mcp_app),
],
lifespan=mcp_app.lifespan,