R: order ICD-10 chapters by chapter number (Roman numeral prefix) in boxplots; handle 'Death' and unknowns at end

This commit is contained in:
2025-10-22 16:08:00 +08:00
parent b954b4b3e7
commit a81da36657

View File

@@ -15,7 +15,7 @@ args <- commandArgs(trailingOnly = TRUE)
one_year_csv <- if (length(args) >= 1) args[1] else "model_comparison_auc_1year.csv"
no_gap_csv <- if (length(args) >= 2) args[2] else "model_comparison_auc_no_gap.csv"
out_dir <- if (length(args) >= 3) args[3] else "."
orientation <- if (length(args) >= 4) tolower(args[4]) else "horizontal" # "horizontal" (flipped) or "vertical"
orientation <- if (length(args) >= 4) tolower(args[4]) else "vertical" # "horizontal" (flipped) or "vertical"
if (!dir.exists(out_dir)) {
dir.create(out_dir, recursive = TRUE, showWarnings = FALSE)
@@ -38,6 +38,29 @@ get_chapter_col <- function(df) {
return(NA_character_)
}
# Compute a deterministic chapter ordering using the ICD-10 chapter numeral prefix
# e.g., "I. Infectious Diseases", "II. Neoplasms", ..., "XXII. ...".
#
# chapters: character vector (or factor) of chapter labels; duplicates allowed.
# Returns: character vector of the distinct labels, ordered as
#   1. labels with a recognised "<Roman numeral>." prefix (I..XXII), by chapter number
#   2. labels starting with "Death" (case-insensitive)
#   3. everything else (no recognised prefix, NA)
# Labels that share a position keep their order of first appearance.
compute_chapter_levels <- function(chapters) {
  # Rank each distinct label once; order() leaves ties in input order, so
  # first-appearance order is preserved without a secondary sort key.
  distinct <- unique(as.character(chapters))

  # ICD-10 defines 22 chapters (I-XXII).
  roman_levels <- c(
    "I", "II", "III", "IV", "V", "VI", "VII", "VIII", "IX", "X",
    "XI", "XII", "XIII", "XIV", "XV", "XVI", "XVII", "XVIII", "XIX", "XX",
    "XXI", "XXII"
  )
  death_pos <- length(roman_levels) + 1L
  unknown_pos <- death_pos + 1L

  # Extract the leading Roman numeral before a dot, like "XVI." -> "XVI".
  # Only labels that actually match are looked up: sub() returns its input
  # unchanged on a non-match, which must not be mistaken for a numeral.
  prefix_pattern <- "^\\s*([IVXLCDM]+)\\..*$"
  has_prefix <- grepl(prefix_pattern, distinct, ignore.case = TRUE)
  numeral <- rep(NA_character_, length(distinct))
  numeral[has_prefix] <- toupper(
    sub(prefix_pattern, "\\1", distinct[has_prefix], ignore.case = TRUE)
  )

  # match() yields the chapter number, or NA when the prefix is not a known chapter
  pos <- match(numeral, roman_levels)
  # Special-case Death after all numbered chapters
  pos[grepl("^\\s*Death\\b", distinct, ignore.case = TRUE)] <- death_pos
  # Unknowns to the very end
  pos[is.na(pos)] <- unknown_pos

  distinct[order(pos)]
}
# Build long-format data.frame with columns: chapter, model, auc
# It will include any of the known model columns that exist in the input df
build_long_df <- function(df) {
@@ -85,14 +108,8 @@ build_long_df <- function(df) {
# Make the boxplot grouped by chapter
make_boxplot <- function(long_df, title_text, flip = TRUE) {
# Order chapters by median AUC of Delphi if available, otherwise overall median
has_delphi <- any(long_df$model == "Delphi")
if (has_delphi) {
med <- aggregate(auc ~ chapter, data = subset(long_df, model == "Delphi"), median, na.rm = TRUE)
} else {
med <- aggregate(auc ~ chapter, data = long_df, median, na.rm = TRUE)
}
chap_levels <- med[order(med$auc, decreasing = TRUE), "chapter"]
# Order chapters by their ICD-10 chapter number prefix (Roman numerals)
chap_levels <- compute_chapter_levels(long_df$chapter)
long_df$chapter <- factor(long_df$chapter, levels = chap_levels)
p <- ggplot(long_df, aes(x = chapter, y = auc, fill = model)) +