4  ANCOVA efficacy analysis

4.1 Setup

import sys
import os
# Add the python-package directory to Python path for demo001 access
sys.path.insert(0, os.path.join(os.path.dirname(os.getcwd()), 'python-package'))

import polars as pl
from datetime import datetime
import rtflite as rtf

# Import demo001 package functions
from demo001 import find_project_root, load_adam_dataset
from demo001.efficacy import (
    prepare_locf_data,
    calculate_descriptive_stats,
    perform_ancova,
    format_efficacy_table,
    format_comparison_table
)
polars.config.Config
# Load data using demo001 utility functions
project_root = find_project_root()
adsl = load_adam_dataset("adsl", project_root)
adlbc = load_adam_dataset("adlbc", project_root)
treatments = ["Placebo", "Xanomeline Low Dose", "Xanomeline High Dose"]

4.2 Step 1: Explore Laboratory Data Structure

We start by understanding the glucose data structure and endpoint definitions.

# Display key laboratory variables
# Key ADLBC variables for efficacy analysis
lab_vars = adlbc.select(["USUBJID", "PARAMCD", "PARAM", "AVISIT", "AVISITN", "AVAL", "BASE", "CHG"])
lab_vars
shape: (74_264, 8)
USUBJID PARAMCD PARAM AVISIT AVISITN AVAL BASE CHG
str str str str f64 f64 f64 f64
"01-701-1015" "SODIUM" "Sodium (mmol/L)" "        Baseline" 0.0 140.0 140.0 null
"01-701-1015" "K" "Potassium (mmol/L)" "        Baseline" 0.0 4.5 4.5 null
"01-701-1015" "CL" "Chloride (mmol/L)" "        Baseline" 0.0 106.0 106.0 null
"01-718-1427" "_CK" "Creatine Kinase (U/L) change f… "          Week 8" 8.0 -0.5 null null
"01-718-1427" "_CK" "Creatine Kinase (U/L) change f… "End of Treatment" 99.0 -0.5 null null
# Focus on glucose parameter
gluc_visits = adlbc.filter(pl.col("PARAMCD") == "GLUC").select("AVISIT", "AVISITN").unique().sort("AVISITN")
# Available glucose measurement visits
gluc_visits
shape: (12, 2)
AVISIT AVISITN
str f64
"               ." null
"        Baseline" 0.0
"          Week 2" 2.0
"         Week 26" 26.0
"End of Treatment" 99.0

4.3 Step 2: Define Analysis Population and Endpoint

Following the Statistical Analysis Plan, we focus on the efficacy population for the primary endpoint.

# Clean data types and prepare datasets
adlbc_clean = adlbc.with_columns([
    pl.col(c).cast(str).str.strip_chars()
    for c in ["USUBJID", "PARAMCD", "AVISIT", "TRTP"]
])

# Define efficacy population
adsl_eff = adsl.filter(pl.col("EFFFL") == "Y").select(["USUBJID"])
# Efficacy population size
adsl_eff.height

# Filter laboratory data to efficacy population
adlbc_eff = adlbc_clean.join(adsl_eff, on="USUBJID", how="inner")
# Laboratory records in efficacy population
adlbc_eff.height
71882
# Examine glucose data availability by visit and treatment
gluc_availability = (
    adlbc_eff.filter(pl.col("PARAMCD") == "GLUC")
    .group_by(["TRTP", "AVISIT"])
    .agg(n_subjects=pl.col("USUBJID").n_unique())
    .sort(["TRTP", "AVISIT"])
)
# Glucose data availability by visit
gluc_availability
shape: (36, 3)
TRTP AVISIT n_subjects
str str u32
"Placebo" "." 13
"Placebo" "Baseline" 79
"Placebo" "End of Treatment" 79
"Xanomeline Low Dose" "Week 6" 62
"Xanomeline Low Dose" "Week 8" 59

4.4 Step 3: Implement LOCF Imputation Strategy

We apply Last Observation Carried Forward (LOCF) for missing Week 24 glucose values using the demo001 efficacy functions.

# Prepare glucose data with LOCF using demo001 function
gluc_data = prepare_locf_data(
    adlbc=adlbc_eff,
    adsl_eff=adsl_eff,
    param="GLUC",
    endpoint_week=24
)

# Subjects with baseline and Week 24 (LOCF) glucose
gluc_data.height
# Sample of prepared analysis data
gluc_data
shape: (232, 7)
USUBJID TRTP BASE Baseline Week_24_LOCF Last_Visit CHG
str str f64 f64 f64 f64 f64
"01-701-1015" "Placebo" 4.71835 4.71835 4.49631 24.0 -0.22204
"01-701-1023" "Placebo" 5.32896 5.32896 5.43998 4.0 0.11102
"01-701-1028" "Xanomeline High Dose" 4.77386 4.77386 5.43998 24.0 0.66612
"01-718-1371" "Xanomeline High Dose" 6.27263 6.27263 5.10692 12.0 -1.16571
"01-718-1427" "Xanomeline High Dose" 4.55182 4.55182 3.71917 8.0 -0.83265
# Assess LOCF imputation impact
locf_summary = (
    gluc_data
    .group_by(["TRTP", "Last_Visit"])
    .agg(n_subjects=pl.len())
    .sort(["TRTP", "Last_Visit"])
)
# LOCF imputation summary (last actual visit used)
locf_summary
shape: (26, 3)
TRTP Last_Visit n_subjects
str f64 u32
"Placebo" 2.0 2
"Placebo" 4.0 2
"Placebo" 6.0 2
"Xanomeline Low Dose" 20.0 5
"Xanomeline Low Dose" 24.0 25

4.5 Step 4: Calculate Descriptive Statistics

We compute baseline, Week 24, and change from baseline statistics by treatment group using demo001 functions.

# Calculate comprehensive descriptive statistics using demo001 function
desc_stats = calculate_descriptive_stats(gluc_data, treatments)

# Display descriptive statistics
desc_df = pl.DataFrame(desc_stats)
# Descriptive statistics by treatment
desc_df
shape: (3, 8)
Treatment N_Analysis Baseline_Mean Baseline_SD Endpoint_Mean Endpoint_SD Change_Mean Change_SD
str i64 f64 f64 f64 f64 f64 f64
"Placebo" 79 5.656399 2.229324 5.639535 1.651807 -0.016864 2.31841
"Xanomeline Low Dose" 79 5.419603 0.946102 5.352148 1.058004 -0.067455 1.015715
"Xanomeline High Dose" 74 5.388971 1.374893 5.831551 2.214668 0.44258 1.645179

4.6 Step 5: Perform ANCOVA Analysis

We fit the ANCOVA model with treatment and baseline glucose as covariates using demo001 functions.

# Perform ANCOVA analysis using demo001 function
ancova_results = perform_ancova(gluc_data, treatments)

# Extract results
model = ancova_results["model"]
ls_means = ancova_results["ls_means"]
comparisons = ancova_results["comparisons"]

# Display model summary
print(f"R-squared: {model.rsquared:.4f}")
print(f"F-statistic: {model.fvalue:.2f}, p-value: {model.f_pvalue:.4f}")

# Display LS means
ls_means_df = pl.DataFrame(ls_means)
ls_means_df
R-squared: 0.2552
F-statistic: 26.04, p-value: 0.0000
shape: (3, 5)
Treatment LS_Mean SE CI_Lower CI_Upper
str f64 f64 f64 f64
"Placebo" 0.071554 0.171563 -0.264709 0.407818
"Xanomeline Low Dose" -0.105215 0.171308 -0.440977 0.230548
"Xanomeline High Dose" 0.388498 0.177055 0.041471 0.735525

4.7 Step 6: Pairwise Treatment Comparisons

We calculate treatment differences and their statistical significance (already calculated in the ANCOVA results).

# Display comparison results from ANCOVA
comparison_df = pl.DataFrame(comparisons)
comparison_df
shape: (2, 7)
Comparison Estimate SE CI_Lower CI_Upper t_stat p_value
str f64 f64 f64 f64 f64 f64
"Xanomeline Low Dose vs. Placeb… -0.176769 0.242635 -0.654862 0.301324 -0.728539 0.467031
"Xanomeline High Dose vs. Place… 0.316943 0.246806 -0.169369 0.803256 1.284179 0.200383

4.8 Step 7: Prepare Tables for RTF Output

We format the analysis results into publication-ready tables using demo001 functions.

# Format efficacy table using demo001 function
tbl1 = format_efficacy_table(desc_stats, ls_means)
tbl1
shape: (3, 8)
Treatment N_Base Mean_SD_Base N_End Mean_SD_End N_Chg Mean_SD_Chg LS_Mean_CI
str str str str str str str str
"Placebo" "79" "5.7 (2.23)" "79" "5.6 (1.65)" "79" "-0.0 (2.32)" "0.07 (-0.26, 0.41)"
"Xanomeline Low Dose" "79" "5.4 (0.95)" "79" "5.4 (1.06)" "79" "-0.1 (1.02)" "-0.11 (-0.44, 0.23)"
"Xanomeline High Dose" "74" "5.4 (1.37)" "74" "5.8 (2.21)" "74" "0.4 (1.65)" "0.39 (0.04, 0.74)"
# Format comparison table using demo001 function
tbl2 = format_comparison_table(comparisons)
tbl2
shape: (2, 3)
Comparison Diff_CI P_Value
str str str
"Xanomeline Low Dose vs. Placeb… "-0.18 (-0.65, 0.30)" "0.4670"
"Xanomeline High Dose vs. Place… "0.32 (-0.17, 0.80)" "0.2004"

4.9 Step 8: Create Regulatory-Compliant RTF Document

We generate a comprehensive efficacy table following regulatory submission standards.

# Create comprehensive RTF document with multiple table sections
doc_ancova = rtf.RTFDocument(
    df=[tbl1, tbl2],
    rtf_title=rtf.RTFTitle(
        text=[
            "Analysis of Covariance (ANCOVA) of Change from Baseline in",
            "Fasting Glucose (mmol/L) at Week 24 (LOCF)",
            "Efficacy Analysis Population"
        ]
    ),
    rtf_column_header=[
        # Header for descriptive statistics table
        [
            rtf.RTFColumnHeader(
                text=["", "Baseline", "Week 24 (LOCF)", "Change from Baseline", ""],
                col_rel_width=[3, 2, 2, 3, 2],
                text_justification=["l", "c", "c", "c", "c"],
                text_format="b"
            ),
            rtf.RTFColumnHeader(
                text=[
                    "Treatment Group",
                    "N", "Mean (SD)",
                    "N", "Mean (SD)",
                    "N", "Mean (SD)",
                    "LS Mean (95% CI){^a}"
                ],
                col_rel_width=[3, 0.7, 1.3, 0.7, 1.3, 0.7, 1.3, 2],
                text_justification=["l"] + ["c"] * 7,
                border_bottom="single",
                text_format="b"
            )
        ],
        # Header for pairwise comparisons table
        [
            rtf.RTFColumnHeader(
                text=[
                    "Pairwise Comparison",
                    "Difference in LS Mean (95% CI){^a}",
                    "p-Value{^b}"
                ],
                col_rel_width=[5, 4, 2],
                text_justification=["l", "c", "c"],
                text_format="b",
                border_bottom="single"
            )
        ]
    ],
    rtf_body=[
        # Body for descriptive statistics
        rtf.RTFBody(
            col_rel_width=[3, 0.7, 1.3, 0.7, 1.3, 0.7, 1.3, 2],
            text_justification=["l"] + ["c"] * 7
        ),
        # Body for pairwise comparisons
        rtf.RTFBody(
            col_rel_width=[5, 4, 2],
            text_justification=["l", "c", "c"]
        )
    ],
    rtf_footnote=rtf.RTFFootnote(
        text=[
            "{^a}LS means and differences in LS means are based on an ANCOVA model with treatment and baseline glucose as covariates.",
            f"{{^b}}p-values are from the ANCOVA model testing treatment effects (overall F-test p-value: {model.f_pvalue:.4f}).",
            "LOCF (Last Observation Carried Forward) approach is used for missing Week 24 values.",
            "ANCOVA = Analysis of Covariance; CI = Confidence Interval; LS = Least Squares; SD = Standard Deviation"
        ]
    ),
    rtf_source=rtf.RTFSource(
        text=[
            "Source: ADLBC Analysis Dataset",
            f"Analysis conducted: {datetime.now().strftime('%d%b%Y').upper()}",
            "Statistical software: Python (statsmodels)"
        ]
    )
)

# Generate RTF file
doc_ancova.write_rtf(project_root / "output" / "tlf_efficacy_ancova.rtf")
/home/runner/work/demo-py-esub/demo-py-esub/output/tlf_efficacy_ancova.rtf