4 ANCOVA efficacy analysis

4.1 Setup

import sys
import os
# Add the python-package directory to Python path for demo001 access
sys.path.insert(0, os.path.join(os.path.dirname(os.getcwd()), 'python-package'))

import polars as pl
from datetime import datetime
import rtflite as rtf

# Import demo001 package functions
from demo001 import find_project_root, load_adam_dataset
from demo001.efficacy import (
    prepare_locf_data,
    calculate_descriptive_stats,
    perform_ancova,
    format_efficacy_table,
    format_comparison_table
)

polars.config.Config

# Load data using demo001 utility functions
project_root = find_project_root()
adsl = load_adam_dataset("adsl", project_root)
adlbc = load_adam_dataset("adlbc", project_root)
treatments = ["Placebo", "Xanomeline Low Dose", "Xanomeline High Dose"]

4.2 Step 1: Explore Laboratory Data Structure

We start by understanding the glucose data structure and endpoint definitions.

# Display key laboratory variables
# Key ADLBC variables for efficacy analysis
lab_vars = adlbc.select(["USUBJID", "PARAMCD", "PARAM", "AVISIT", "AVISITN", "AVAL", "BASE", "CHG"])
lab_vars

shape: (74_264, 8)

USUBJID	PARAMCD	PARAM	AVISIT	AVISITN	AVAL	BASE	CHG
str	str	str	str	f64	f64	f64	f64
"01-701-1015"	"SODIUM"	"Sodium (mmol/L)"	" Baseline"	0.0	140.0	140.0	null
"01-701-1015"	"K"	"Potassium (mmol/L)"	" Baseline"	0.0	4.5	4.5	null
"01-701-1015"	"CL"	"Chloride (mmol/L)"	" Baseline"	0.0	106.0	106.0	null
…	…	…	…	…	…	…	…
"01-718-1427"	"_CK"	"Creatine Kinase (U/L) change f…	" Week 8"	8.0	-0.5	null	null
"01-718-1427"	"_CK"	"Creatine Kinase (U/L) change f…	"End of Treatment"	99.0	-0.5	null	null

# Focus on glucose parameter
gluc_visits = adlbc.filter(pl.col("PARAMCD") == "GLUC").select("AVISIT", "AVISITN").unique().sort("AVISITN")
# Available glucose measurement visits
gluc_visits

shape: (12, 2)

AVISIT	AVISITN
str	f64
" ."	null
" Baseline"	0.0
" Week 2"	2.0
…	…
" Week 26"	26.0
"End of Treatment"	99.0

4.3 Step 2: Define Analysis Population and Endpoint

Following the Statistical Analysis Plan, we focus on the efficacy population for the primary endpoint.

# Clean data types and prepare datasets
adlbc_clean = adlbc.with_columns([
    pl.col(c).cast(str).str.strip_chars()
    for c in ["USUBJID", "PARAMCD", "AVISIT", "TRTP"]
])

# Define efficacy population
adsl_eff = adsl.filter(pl.col("EFFFL") == "Y").select(["USUBJID"])
# Efficacy population size
adsl_eff.height

# Filter laboratory data to efficacy population
adlbc_eff = adlbc_clean.join(adsl_eff, on="USUBJID", how="inner")
# Laboratory records in efficacy population
adlbc_eff.height

# Examine glucose data availability by visit and treatment
gluc_availability = (
    adlbc_eff.filter(pl.col("PARAMCD") == "GLUC")
    .group_by(["TRTP", "AVISIT"])
    .agg(n_subjects=pl.col("USUBJID").n_unique())
    .sort(["TRTP", "AVISIT"])
)
# Glucose data availability by visit
gluc_availability

shape: (36, 3)

TRTP	AVISIT	n_subjects
str	str	u32
"Placebo"	"."	13
"Placebo"	"Baseline"	79
"Placebo"	"End of Treatment"	79
…	…	…
"Xanomeline Low Dose"	"Week 6"	62
"Xanomeline Low Dose"	"Week 8"	59

4.4 Step 3: Implement LOCF Imputation Strategy

We apply Last Observation Carried Forward (LOCF) for missing Week 24 glucose values using the demo001 efficacy functions.

# Prepare glucose data with LOCF using demo001 function
gluc_data = prepare_locf_data(
    adlbc=adlbc_eff,
    adsl_eff=adsl_eff,
    param="GLUC",
    endpoint_week=24
)

# Subjects with baseline and Week 24 (LOCF) glucose
gluc_data.height
# Sample of prepared analysis data
gluc_data

shape: (232, 7)

USUBJID	TRTP	BASE	Baseline	Week_24_LOCF	Last_Visit	CHG
str	str	f64	f64	f64	f64	f64
"01-701-1015"	"Placebo"	4.71835	4.71835	4.49631	24.0	-0.22204
"01-701-1023"	"Placebo"	5.32896	5.32896	5.43998	4.0	0.11102
"01-701-1028"	"Xanomeline High Dose"	4.77386	4.77386	5.43998	24.0	0.66612
…	…	…	…	…	…	…
"01-718-1371"	"Xanomeline High Dose"	6.27263	6.27263	5.10692	12.0	-1.16571
"01-718-1427"	"Xanomeline High Dose"	4.55182	4.55182	3.71917	8.0	-0.83265

# Assess LOCF imputation impact
locf_summary = (
    gluc_data
    .group_by(["TRTP", "Last_Visit"])
    .agg(n_subjects=pl.len())
    .sort(["TRTP", "Last_Visit"])
)
# LOCF imputation summary (last actual visit used)
locf_summary

shape: (26, 3)

TRTP	Last_Visit	n_subjects
str	f64	u32
"Placebo"	2.0	2
"Placebo"	4.0	2
"Placebo"	6.0	2
…	…	…
"Xanomeline Low Dose"	20.0	5
"Xanomeline Low Dose"	24.0	25

4.5 Step 4: Calculate Descriptive Statistics

We compute baseline, Week 24, and change from baseline statistics by treatment group using demo001 functions.

# Calculate comprehensive descriptive statistics using demo001 function
desc_stats = calculate_descriptive_stats(gluc_data, treatments)

# Display descriptive statistics
desc_df = pl.DataFrame(desc_stats)
# Descriptive statistics by treatment
desc_df

shape: (3, 8)

Treatment	N_Analysis	Baseline_Mean	Baseline_SD	Endpoint_Mean	Endpoint_SD	Change_Mean	Change_SD
str	i64	f64	f64	f64	f64	f64	f64
"Placebo"	79	5.656399	2.229324	5.639535	1.651807	-0.016864	2.31841
"Xanomeline Low Dose"	79	5.419603	0.946102	5.352148	1.058004	-0.067455	1.015715
"Xanomeline High Dose"	74	5.388971	1.374893	5.831551	2.214668	0.44258	1.645179

4.6 Step 5: Perform ANCOVA Analysis

We fit the ANCOVA model with treatment and baseline glucose as covariates using demo001 functions.

# Perform ANCOVA analysis using demo001 function
ancova_results = perform_ancova(gluc_data, treatments)

# Extract results
model = ancova_results["model"]
ls_means = ancova_results["ls_means"]
comparisons = ancova_results["comparisons"]

# Display model summary
print(f"R-squared: {model.rsquared:.4f}")
print(f"F-statistic: {model.fvalue:.2f}, p-value: {model.f_pvalue:.4f}")

# Display LS means
ls_means_df = pl.DataFrame(ls_means)
ls_means_df

R-squared: 0.2552
F-statistic: 26.04, p-value: 0.0000

shape: (3, 5)

Treatment	LS_Mean	SE	CI_Lower	CI_Upper
str	f64	f64	f64	f64
"Placebo"	0.071554	0.171563	-0.264709	0.407818
"Xanomeline Low Dose"	-0.105215	0.171308	-0.440977	0.230548
"Xanomeline High Dose"	0.388498	0.177055	0.041471	0.735525

4.7 Step 6: Pairwise Treatment Comparisons

We calculate treatment differences and their statistical significance (already calculated in the ANCOVA results).

# Display comparison results from ANCOVA
comparison_df = pl.DataFrame(comparisons)
comparison_df

shape: (2, 7)

Comparison	Estimate	SE	CI_Lower	CI_Upper	t_stat	p_value
str	f64	f64	f64	f64	f64	f64
"Xanomeline Low Dose vs. Placeb…	-0.176769	0.242635	-0.654862	0.301324	-0.728539	0.467031
"Xanomeline High Dose vs. Place…	0.316943	0.246806	-0.169369	0.803256	1.284179	0.200383

4.8 Step 7: Prepare Tables for RTF Output

We format the analysis results into publication-ready tables using demo001 functions.

# Format efficacy table using demo001 function
tbl1 = format_efficacy_table(desc_stats, ls_means)
tbl1

shape: (3, 8)

Treatment	N_Base	Mean_SD_Base	N_End	Mean_SD_End	N_Chg	Mean_SD_Chg	LS_Mean_CI
str	str	str	str	str	str	str	str
"Placebo"	"79"	"5.7 (2.23)"	"79"	"5.6 (1.65)"	"79"	"-0.0 (2.32)"	"0.07 (-0.26, 0.41)"
"Xanomeline Low Dose"	"79"	"5.4 (0.95)"	"79"	"5.4 (1.06)"	"79"	"-0.1 (1.02)"	"-0.11 (-0.44, 0.23)"
"Xanomeline High Dose"	"74"	"5.4 (1.37)"	"74"	"5.8 (2.21)"	"74"	"0.4 (1.65)"	"0.39 (0.04, 0.74)"

# Format comparison table using demo001 function
tbl2 = format_comparison_table(comparisons)
tbl2

shape: (2, 3)

Comparison	Diff_CI	P_Value
str	str	str
"Xanomeline Low Dose vs. Placeb…	"-0.18 (-0.65, 0.30)"	"0.4670"
"Xanomeline High Dose vs. Place…	"0.32 (-0.17, 0.80)"	"0.2004"

4.9 Step 8: Create Regulatory-Compliant RTF Document

We generate a comprehensive efficacy table following regulatory submission standards.

# Create comprehensive RTF document with multiple table sections
doc_ancova = rtf.RTFDocument(
    df=[tbl1, tbl2],
    rtf_title=rtf.RTFTitle(
        text=[
            "Analysis of Covariance (ANCOVA) of Change from Baseline in",
            "Fasting Glucose (mmol/L) at Week 24 (LOCF)",
            "Efficacy Analysis Population"
        ]
    ),
    rtf_column_header=[
        # Header for descriptive statistics table
        [
            rtf.RTFColumnHeader(
                text=["", "Baseline", "Week 24 (LOCF)", "Change from Baseline", ""],
                col_rel_width=[3, 2, 2, 3, 2],
                text_justification=["l", "c", "c", "c", "c"],
                text_format="b"
            ),
            rtf.RTFColumnHeader(
                text=[
                    "Treatment Group",
                    "N", "Mean (SD)",
                    "N", "Mean (SD)",
                    "N", "Mean (SD)",
                    "LS Mean (95% CI){^a}"
                ],
                col_rel_width=[3, 0.7, 1.3, 0.7, 1.3, 0.7, 1.3, 2],
                text_justification=["l"] + ["c"] * 7,
                border_bottom="single",
                text_format="b"
            )
        ],
        # Header for pairwise comparisons table
        [
            rtf.RTFColumnHeader(
                text=[
                    "Pairwise Comparison",
                    "Difference in LS Mean (95% CI){^a}",
                    "p-Value{^b}"
                ],
                col_rel_width=[5, 4, 2],
                text_justification=["l", "c", "c"],
                text_format="b",
                border_bottom="single"
            )
        ]
    ],
    rtf_body=[
        # Body for descriptive statistics
        rtf.RTFBody(
            col_rel_width=[3, 0.7, 1.3, 0.7, 1.3, 0.7, 1.3, 2],
            text_justification=["l"] + ["c"] * 7
        ),
        # Body for pairwise comparisons
        rtf.RTFBody(
            col_rel_width=[5, 4, 2],
            text_justification=["l", "c", "c"]
        )
    ],
    rtf_footnote=rtf.RTFFootnote(
        text=[
            "{^a}LS means and differences in LS means are based on an ANCOVA model with treatment and baseline glucose as covariates.",
            f"{{^b}}p-values are from the ANCOVA model testing treatment effects (overall F-test p-value: {model.f_pvalue:.4f}).",
            "LOCF (Last Observation Carried Forward) approach is used for missing Week 24 values.",
            "ANCOVA = Analysis of Covariance; CI = Confidence Interval; LS = Least Squares; SD = Standard Deviation"
        ]
    ),
    rtf_source=rtf.RTFSource(
        text=[
            "Source: ADLBC Analysis Dataset",
            f"Analysis conducted: {datetime.now().strftime('%d%b%Y').upper()}",
            "Statistical software: Python (statsmodels)"
        ]
    )
)

# Generate RTF file
doc_ancova.write_rtf(project_root / "output" / "tlf_efficacy_ancova.rtf")

/home/runner/work/demo-py-esub/demo-py-esub/output/tlf_efficacy_ancova.rtf