import pandas as pd
from pathlib import Path
from IPython.display import Markdown, HTML
from tulip.plots import plot_bar, plot_lines
from tulip_mania.notebook_health import (
analyze_notebooks,
generate_health_report,
get_failed_notebooks,
get_notebooks_by_data_source,
get_latest_build_log,
merge_with_build_log,
get_notebook_link,
get_project_notebooks,
)
from tulip_mania.columns import columns

# --- Notebook Health Monitor ---
#
# This notebook analyzes the health of all notebooks in the documentation, including:
#
# - Execution status (success, error, partial, not run)
# - Error analysis and diagnostics
# - Notebook inventory and complexity metrics
# - Data source usage (Bloomberg, Haver, FRED, GS)
# Configuration
# Locate the project root so the notebook behaves the same whether it is
# launched from the root itself or from projects/other/.
cwd = Path.cwd()
grandparent = cwd.parent.parent

if (cwd / "projects").is_dir():
    # Launched from the project root (e.g. via main.py).
    PROJECT_ROOT = cwd
elif cwd.name == "other" and (grandparent / "projects").is_dir():
    # Launched interactively from projects/other/.
    PROJECT_ROOT = grandparent
else:
    # Last resort: nearest directory (cwd first, then its ancestors) holding
    # a myst.yml; if none exists, stay on cwd.
    PROJECT_ROOT = next(
        (
            candidate
            for candidate in (cwd, *cwd.parents)
            if (candidate / "myst.yml").exists()
        ),
        cwd,
    )

PROJECTS_PATH = PROJECT_ROOT / "projects"
LOG_DIR = PROJECT_ROOT / "logs"

print(f"Project root: {PROJECT_ROOT.resolve()}")
print(f"Analyzing notebooks in: {PROJECTS_PATH.resolve()}")
# Run analysis using the project-aware function, then attach the build-log
# data before summarising.
df = merge_with_build_log(get_project_notebooks(PROJECT_ROOT), LOG_DIR)
report = generate_health_report(df)

# Get last build info (skipped when get_latest_build_log returns a falsy value)
build_log = get_latest_build_log(LOG_DIR)
if build_log:
    built_at = build_log["log_date"].strftime("%Y-%m-%d %H:%M")
    build_summary = build_log["summary"]
    print(f"Last build: {built_at}")
    print(f"Build results: {build_summary['passed']}/{build_summary['total']} passed")

print(f"Analyzed {report['total_notebooks']} notebooks")

# --- Summary Dashboard ---
# Summary statistics
# Pull every figure shown on the dashboard into a local first so the markup
# below only interpolates plain names.
status_counts = report["status_counts"]
total_count = report["total_notebooks"]
success_count = status_counts.get("success", 0)
error_count = status_counts.get("error", 0)
partial_count = status_counts.get("partial", 0)
not_run_count = status_counts.get("not_run", 0)
read_error_count = status_counts.get("read_error", 0)
success_rate = report["success_rate"]
error_rate = report["error_rate"]

# One coloured card per headline figure, laid out as a wrapping flex row.
summary_html = f"""
<div style="display: flex; gap: 20px; flex-wrap: wrap;">
<div style="background: #e8f5e9; padding: 20px; border-radius: 8px; min-width: 150px;">
<h3 style="margin: 0; color: #2e7d32;">Total</h3>
<p style="font-size: 32px; margin: 10px 0; font-weight: bold;">{total_count}</p>
<p style="margin: 0; color: #666;">notebooks</p>
</div>
<div style="background: #e3f2fd; padding: 20px; border-radius: 8px; min-width: 150px;">
<h3 style="margin: 0; color: #1565c0;">Success</h3>
<p style="font-size: 32px; margin: 10px 0; font-weight: bold;">{success_count}</p>
<p style="margin: 0; color: #666;">{success_rate:.1f}%</p>
</div>
<div style="background: #ffebee; padding: 20px; border-radius: 8px; min-width: 150px;">
<h3 style="margin: 0; color: #c62828;">Errors</h3>
<p style="font-size: 32px; margin: 10px 0; font-weight: bold;">{error_count}</p>
<p style="margin: 0; color: #666;">{error_rate:.1f}%</p>
</div>
<div style="background: #ffe0b2; padding: 20px; border-radius: 8px; min-width: 150px;">
<h3 style="margin: 0; color: #e65100;">Partial</h3>
<p style="font-size: 32px; margin: 10px 0; font-weight: bold;">{partial_count}</p>
<p style="margin: 0; color: #666;">some cells ran</p>
</div>
<div style="background: #fff3e0; padding: 20px; border-radius: 8px; min-width: 150px;">
<h3 style="margin: 0; color: #ef6c00;">Not Run</h3>
<p style="font-size: 32px; margin: 10px 0; font-weight: bold;">{not_run_count}</p>
<p style="margin: 0; color: #666;">notebooks</p>
</div>
</div>
"""

# Show read errors if any: the extra banner is appended only when needed.
if read_error_count > 0:
    summary_html = summary_html + f"""
<div style="margin-top: 10px; background: #f5f5f5; padding: 10px; border-radius: 8px;">
<span style="color: #666;">Read errors: {read_error_count} notebooks could not be parsed</span>
</div>
"""

HTML(summary_html)
# Execution status breakdown
# Bar chart of how many notebooks fall under each execution status.
# The frame is built from (status, count) pairs with explicit column names:
# building it from an empty list of records produces a frame with no columns,
# and the following set_index("Status") would then raise KeyError when
# report["status_counts"] is empty (e.g. no notebooks were found).
status_df = pd.DataFrame(
    list(report["status_counts"].items()),
    columns=["Status", "Count"],
).set_index("Status")
plot_bar(
    status_df["Count"],
    title="<b>Notebook Execution Status</b>",
    figsize=(600, 400),
)

# --- Error Analysis ---
# Failed notebooks with clickable links
from html import escape as html_escape

failed = get_failed_notebooks(df)
location_col = "project" if "project" in df.columns else "folder"


def make_link(row):
    """Create a relative HTML link to the notebook described by *row*.

    Args:
        row: A mapping-like record with a ``"path"`` entry for the notebook.

    Returns:
        An ``<a>`` tag pointing at ``../<path relative to PROJECTS_PATH>``
        with a ``.ipynb`` suffix swapped for ``.html``, or just the escaped
        file name when the notebook does not live under ``PROJECTS_PATH``.
    """
    path = Path(row["path"])
    label = html_escape(path.name)
    try:
        rel_path = path.relative_to(PROJECTS_PATH)
    except ValueError:
        # Not under the projects folder: no relative URL can be derived.
        return label
    # Swap only the real file suffix; a plain str.replace would also rewrite
    # ".ipynb" occurring inside a directory name.
    if rel_path.suffix == ".ipynb":
        rel_path = rel_path.with_suffix(".html")
    href = html_escape(rel_path.as_posix(), quote=True)
    return f'<a href="../{href}">{label}</a>'


if failed.empty:
    display(Markdown("No failed notebooks found."))
else:
    display(Markdown(f"### Failed Notebooks ({len(failed)})"))

    # Create clickable links - adjust for project structure
    failed_display = failed.copy()
    failed_display["notebook"] = failed_display.apply(make_link, axis=1)

    # Prefer location_col, fall back to "folder"; keep only existing columns.
    place_col = location_col if location_col in failed_display.columns else "folder"
    wanted_cols = ["notebook", place_col, "error_cell_index", "error_type", "error_message"]
    display_cols = [c for c in wanted_cols if c in failed_display.columns]

    # The table is rendered with escape=False so the anchors survive, which
    # means every other text cell must be escaped by hand; otherwise error
    # text such as "<module>" is interpreted as markup.
    failed_table = failed_display[display_cols].copy()
    for col in display_cols:
        if col != "notebook":
            failed_table[col] = failed_table[col].map(
                lambda cell: html_escape(cell) if isinstance(cell, str) else cell
            )

    # to_html honours display.max_colwidth and would cut long cells (including
    # the anchor markup) short, so lift the limit for this call only.
    with pd.option_context("display.max_colwidth", None):
        failed_html = failed_table.to_html(escape=False, index=False)
    display(HTML(failed_html))
# Error types breakdown
# Tabulate how often each error type occurred; nothing is shown when the
# report lists no error types.
error_types = report["error_types"]
if error_types:
    display(Markdown("### Error Types"))
    error_df = pd.DataFrame(
        list(error_types.items()),
        columns=["Error Type", "Count"],
    )
    display(error_df.style.hide(axis="index"))
# Partial execution notebooks
# List notebooks whose execution status is "partial".
location_col = "project" if "project" in df.columns else "folder"
display_cols = [
    col
    for col in ("name", location_col, "cells_executed", "code_cells")
    if col in df.columns  # tolerate frames that lack some of these columns
]
is_partial = df["execution_status"] == "partial"
partial_nbs = df.loc[is_partial, display_cols]

if not partial_nbs.empty:
    display(Markdown(f"### Partially Executed Notebooks ({len(partial_nbs)})"))
    display(
        Markdown(
            "_These notebooks started execution but didn't complete all code cells._"
        )
    )
    left_aligned = partial_nbs.style.set_properties(**{"text-align": "left"})
    display(left_aligned.hide(axis="index"))
else:
    display(Markdown("No partially executed notebooks found."))

# --- Notebook Inventory ---
# Notebooks by project folder
group_col = "project" if "project" in df.columns else "folder"


def _count_successful(statuses):
    """Return how many entries of *statuses* equal ``"success"``."""
    return (statuses == "success").sum()


# Named aggregation yields the final column labels directly: notebook count,
# successful count, and mean code cells / lines of code per group.
folder_df = df.groupby(group_col).agg(
    **{
        "Notebooks": ("name", "count"),
        "Successful": ("execution_status", _count_successful),
        "Avg Code Cells": ("code_cells", "mean"),
        "Avg Lines": ("lines_of_code", "mean"),
    }
)

display(Markdown(f"### Notebooks by {group_col.title()}"))
average_formats = {
    "Avg Code Cells": "{:.1f}",
    "Avg Lines": "{:.0f}",
}
folder_df.style.format(average_formats)
# Full inventory table with build times
display(Markdown("### Full Inventory"))

# Use project column if available, otherwise folder
location_col = "project" if "project" in df.columns else "folder"

# Candidate columns in display order; only those present in df are kept.
inventory_cols = [
    col
    for col in (
        "name",
        location_col,
        "execution_status",
        "build_time",
        "code_cells",
        "lines_of_code",
        "uses_bloomberg",
        "uses_haver",
        "uses_iris",
    )
    if col in df.columns
]
def status_color(val):
    """Return the CSS background rule for an execution status.

    Args:
        val: A cell value; expected to be one of the status strings below.

    Returns:
        ``"background-color: <hex>"`` for ``success``, ``error``, ``not_run``
        and ``partial``; an empty string for any other value.
    """
    shades = {
        "success": "#c8e6c9",
        "error": "#ffcdd2",
        "not_run": "#fff9c4",
        "partial": "#ffe0b2",
    }
    shade = shades.get(val)
    if shade is None:
        return ""
    return f"background-color: {shade}"
# Format build time as seconds
def _format_build_time(seconds):
    """Render a numeric build time as ``"<x.x>s"``; anything else as ``"-"``."""
    if pd.notna(seconds) and isinstance(seconds, (int, float)):
        return f"{seconds:.1f}s"
    return "-"


inv_df = df[inventory_cols].copy()
if "build_time" in inv_df.columns:
    inv_df["build_time"] = inv_df["build_time"].apply(_format_build_time)

# Colour the status column cell-by-cell and drop the index from the output.
inv_df.style.map(status_color, subset=["execution_status"]).hide(axis="index")

# --- Complexity Metrics ---
# Complexity summary
# Read the four headline figures once; the markup below only formats them.
avg_cells = report["avg_cells"]
avg_code_cells = report["avg_code_cells"]
avg_lines_of_code = report["avg_lines_of_code"]
timestamped_count = report["notebooks_with_timestamp"]

complexity_html = f"""
<div style="display: flex; gap: 20px; flex-wrap: wrap;">
<div style="background: #f3e5f5; padding: 20px; border-radius: 8px; min-width: 150px;">
<h3 style="margin: 0; color: #7b1fa2;">Avg Cells</h3>
<p style="font-size: 32px; margin: 10px 0; font-weight: bold;">{avg_cells:.1f}</p>
<p style="margin: 0; color: #666;">per notebook</p>
</div>
<div style="background: #e1f5fe; padding: 20px; border-radius: 8px; min-width: 150px;">
<h3 style="margin: 0; color: #0277bd;">Avg Code Cells</h3>
<p style="font-size: 32px; margin: 10px 0; font-weight: bold;">{avg_code_cells:.1f}</p>
<p style="margin: 0; color: #666;">per notebook</p>
</div>
<div style="background: #fce4ec; padding: 20px; border-radius: 8px; min-width: 150px;">
<h3 style="margin: 0; color: #c2185b;">Avg Lines</h3>
<p style="font-size: 32px; margin: 10px 0; font-weight: bold;">{avg_lines_of_code:.0f}</p>
<p style="margin: 0; color: #666;">of code</p>
</div>
<div style="background: #e8eaf6; padding: 20px; border-radius: 8px; min-width: 150px;">
<h3 style="margin: 0; color: #3f51b5;">With Timestamp</h3>
<p style="font-size: 32px; margin: 10px 0; font-weight: bold;">{timestamped_count}</p>
<p style="margin: 0; color: #666;">notebooks</p>
</div>
</div>
"""

HTML(complexity_html)
# Largest notebooks by lines of code
display(Markdown("### Largest Notebooks (by lines of code)"))
location_col = "project" if "project" in df.columns else "folder"

# Ten rows with the highest lines_of_code, trimmed to the columns of interest.
largest = df.nlargest(10, "lines_of_code")
largest_cols = ["name", location_col, "lines_of_code", "code_cells"]
largest[largest_cols].style.hide(axis="index")
# Slowest notebooks by build time
display(Markdown("### Slowest Notebooks (by build time)"))

# Same location column choice as the other tables in this notebook, instead
# of a hard-coded "folder" that may be absent when "project" is used.
location_col = "project" if "project" in df.columns else "folder"

if "build_time" in df.columns:
    # Ensure build_time is numeric for sorting; unparseable values become NaN.
    df["build_time"] = pd.to_numeric(df["build_time"], errors="coerce")
    timed = df[df["build_time"].notna()]
else:
    # The inventory cell treats build_time as optional, so do the same here
    # rather than failing with KeyError when the column is missing.
    timed = df.iloc[0:0]

if not timed.empty:
    slowest_cols = [
        col
        for col in ("name", location_col, "build_time", "code_cells")
        if col in timed.columns
    ]
    slowest = timed.nlargest(10, "build_time")[slowest_cols].copy()
    slowest["build_time"] = slowest["build_time"].apply(lambda x: f"{x:.1f}s")
    display(slowest.style.hide(axis="index"))
else:
    display(Markdown("_No build time data available. Run a full build to populate._"))

# --- Data Source Usage ---
# Data source usage chart
# Explicit column names keep set_index("Source") valid even when
# report["data_source_usage"] is empty; a frame built from an empty list of
# records has no columns and set_index would raise KeyError.
ds_df = pd.DataFrame(
    [
        (source.title(), count)
        for source, count in report["data_source_usage"].items()
    ],
    columns=["Source", "Notebooks"],
).set_index("Source")
plot_bar(
    ds_df["Notebooks"],
    title="<b>Data Source Usage Across Notebooks</b>",
    figsize=(600, 400),
)
# Notebooks by data source
# Explicit display(), as for every other heading in this notebook: a bare
# Markdown(...) expression is only rendered when it happens to be the last
# expression of its cell.
display(Markdown("### Notebooks by Data Source"))


def _show_notebooks_using(source):
    """Display a heading and up to ten notebooks that use *source*.

    Args:
        source: Data source key passed to ``get_notebooks_by_data_source``
            (e.g. ``"bloomberg"``); its title-cased form is the heading.

    Returns:
        The full frame returned by ``get_notebooks_by_data_source``.
    """
    matches = get_notebooks_by_data_source(df, source)
    display(Markdown(f"**{source.title()}** ({len(matches)} notebooks)"))
    if not matches.empty:
        display(matches.head(10).style.hide(axis="index"))
    return matches


bloomberg_nbs = _show_notebooks_using("bloomberg")
haver_nbs = _show_notebooks_using("haver")

from tulip_mania.notebook_related import notebook_updated

notebook_updated()