Files
DeepHealth/prepare_data.R
Jiarui Li 9ca8909e3a Add data preparation scripts for UK Biobank analysis
- Introduced `prepare_data.R` for merging disease and other data from CSV files.
- Added `prepare_data.py` for processing UK Biobank data, including:
  - Mapping field IDs to human-readable names.
  - Handling date variables and converting them to offsets.
  - Processing disease events and constructing tabular features.
  - Splitting data into training, validation, and test sets.
  - Saving processed data to binary and CSV formats.
2025-12-04 11:26:49 +08:00

26 lines
1.0 KiB
R

library(data.table)
# Let data.table use up to 40 threads for its multi-threaded operations
# (the fread()/fwrite() calls further down).
setDTthreads(threads = 40)
library(readr)
# Field IDs (column names) to extract, read from the first column of
# field_id.txt. They are read as character so they are compared with the CSV
# headers verbatim instead of going through read.csv()'s type guessing
# (all-numeric IDs would otherwise become numbers, and a numeric `select`
# in fread() means column positions, not names).
field_id <- read.csv(
  "field_id.txt",
  header = FALSE,
  colClasses = "character"
)
uid <- unique(trimws(field_id$V1))

# Read from `path` only the columns whose names appear in `wanted`.
#
# The header is scanned first (nrows = 0 returns the column names without
# parsing any data rows), so the full file is parsed only for the matching
# columns.
#
# Args:
#   path:   CSV file to read.
#   wanted: Character vector of column names to keep; names that are absent
#           from the file are ignored.
#   key:    Column that is always requested in addition to `wanted`, because
#           the tables are merged on it afterwards.
#
# Returns: a data.table holding the selected columns.
# Raises:  an error if `key` is not a column of the file. This fails before
#          the expensive full read rather than later in merge().
read_selected_columns <- function(path, wanted, key = "eid") {
  all_names <- names(fread(path, nrows = 0))
  keep_names <- intersect(all_names, union(key, as.character(wanted)))
  if (!key %in% keep_names) {
    stop("Column '", key, "' not found in ", path, call. = FALSE)
  }
  if (length(keep_names) == 1L) {
    warning("No requested field IDs found in ", path, call. = FALSE)
  }
  fread(path, select = keep_names, showProgress = TRUE)
}

# NOTE(review): the two file names follow different patterns
# ("HHdata_..." vs "HH_data_...") -- confirm both exist as spelled.
ukb_dir <- "/mnt/storage/shared_data/UKBB/20230518-from-zhourong"
ukb_disease <- read_selected_columns(
  file.path(ukb_dir, "HHdata_221103_0512.csv"),
  uid
)
ukb_others <- read_selected_columns(
  file.path(ukb_dir, "HH_data_220812_0512.csv"),
  uid
)
# Full outer join of the two extracts on their shared "eid" column: rows
# that appear in only one of the tables are kept.
# NOTE(review): any non-key column present in both tables will come out
# with ".x"/".y" suffixes -- confirm the two extracts do not overlap.
ukb_data <- merge(
  x = ukb_disease,
  y = ukb_others,
  by = "eid",
  all.x = TRUE,
  all.y = TRUE
)

# Write the combined table as CSV into the current working directory.
fwrite(ukb_data, file = "ukb_data.csv")