Add data preparation scripts for UK Biobank analysis

- Introduced `prepare_data.R` for merging disease and other data from CSV files.
- Added `prepare_data.py` for processing UK Biobank data, including:
  - Mapping field IDs to human-readable names.
  - Handling date variables and converting them to offsets.
  - Processing disease events and constructing tabular features.
  - Splitting data into training, validation, and test sets.
  - Saving processed data to binary and CSV formats.
2025-12-04 11:26:49 +08:00
parent d48c62466f
commit 9ca8909e3a
8 changed files with 5420 additions and 0 deletions
--- a/prepare_data.R
+++ b/prepare_data.R
@@ -0,0 +1,26 @@
+library(data.table)  # Script: pull the columns named in field_id.txt from two CSV files, merge them on "eid", write ukb_data.csv
+setDTthreads(40)     # ask data.table to use 40 threads
+library(readr)       # NOTE(review): no readr function is called in the visible script - confirm this is needed
+field_id <- read.csv("field_id.txt", header = FALSE)  # headerless file, so the first column is named V1
+uid <- field_id$V1   # first column of field_id.txt: the column names to keep
+big_path   <- "/mnt/storage/shared_data/UKBB/20230518-from-zhourong/HHdata_221103_0512.csv"
+header_dt  <- fread(big_path, nrows = 0)     # read 0 rows => column names only
+all_names  <- names(header_dt)
+keep_names <- intersect(all_names,uid)       # requested names that actually exist in this file
+ukb_disease <- fread(big_path,
+                 select     = keep_names,
+                 showProgress = TRUE)
+# Second input file: the same column-selection steps as above, result stored in ukb_others.
+field_id <- read.csv("field_id.txt", header = FALSE)  # NOTE(review): same file re-read with the same arguments as above
+uid <- field_id$V1
+big_path <- "/mnt/storage/shared_data/UKBB/20230518-from-zhourong/HH_data_220812_0512.csv"
+header_dt  <- fread(big_path, nrows = 0)     # read 0 rows => column names only
+all_names  <- names(header_dt)
+keep_names <- intersect(all_names,uid)       # requested names that actually exist in this file
+ukb_others <- fread(big_path,
+                 select     = keep_names,
+                 showProgress = TRUE)
+# NOTE(review): the merge below needs "eid" in both tables, i.e. "eid" must be listed in field_id.txt and present in both CSVs - confirm.
+# merge disease and other data by "eid"
+ukb_data <- merge(ukb_disease, ukb_others, by = "eid", all = TRUE)  # all = TRUE keeps rows found in only one table; NOTE(review): other columns shared by both files would appear twice - verify
+fwrite(ukb_data, "ukb_data.csv")  # relative path: written to the current working directory