Add data preparation scripts for UK Biobank analysis

- Introduced `prepare_data.R` for merging disease and other data from CSV files.
- Added `prepare_data.py` for processing UK Biobank data, including:
  - Mapping field IDs to human-readable names.
  - Handling date variables and converting them to offsets.
  - Processing disease events and constructing tabular features.
  - Splitting data into training, validation, and test sets.
  - Saving processed data to binary and CSV formats.
Commit 9ca8909e3a (parent d48c62466f)
Date: 2025-12-04 11:26:49 +08:00
8 changed files with 5420 additions and 0 deletions

delphi_fork/labels.csv: new file, 1270 lines (diff suppressed because it is too large)

delphi_fork/prepare_data.py: new file, 216 lines

@@ -0,0 +1,216 @@
import pandas as pd # Pandas for data manipulation
import tqdm # Progress bar for chunk processing
import numpy as np # Numerical operations
train_frac = 0.7 # Fraction of participants for training split
val_frac = 0.15 # Fraction of participants for validation split
test_frac = 0.15 # Fraction of participants for test split
# CSV mapping field IDs to human-readable names
field_map_file = "../field_ids_enriched.csv"
field_dict = {} # Map original field ID -> new column name
with open(field_map_file, "r", encoding="utf-8") as f: # Open the field mapping file
next(f) # skip header line
for line in f: # Iterate through lines
parts = line.strip().split(",") # Split by CSV commas
        if len(parts) >= 3:  # Require at least the id, full-name, and var-name columns
            field_id = parts[0]  # Original field identifier (e.g., "34-0.0")
            field_name = parts[2]  # Human-readable column name
            field_dict[field_id] = field_name  # Record the mapping
# TSV mapping field IDs to ICD10-related date columns
field_to_icd_map = "../icd10_codes_mod.tsv"
# Date-like variables to be converted to offsets
date_vars = []
with open(field_to_icd_map, "r", encoding="utf-8") as f: # Open ICD10 mapping
for line in f: # Iterate each mapping row
parts = line.strip().split() # Split on whitespace for TSV
if len(parts) >= 6: # Guard against malformed lines
# Map field ID to the date column name
field_dict[parts[0]] = parts[5]
date_vars.append(parts[5]) # Track date column names in order
for j in range(17): # Map up to 17 cancer entry slots (dates and types)
# Cancer diagnosis date slot j
field_dict[f'40005-{j}.0'] = f'cancer_date_{j}'
field_dict[f'40006-{j}.0'] = f'cancer_type_{j}' # Cancer type/code slot j
# Number of ICD-related date columns before adding extras
len_icd = len(date_vars)
date_vars.extend(['Death', 'date_of_assessment'] + # Add outcome date and assessment date
# Add cancer date columns
[f'cancer_date_{j}' for j in range(17)])
labels_file = "labels.csv" # File listing label codes
label_dict = {} # Map code string -> integer label id
with open(labels_file, "r", encoding="utf-8") as f: # Open labels file
for idx, line in enumerate(f): # Enumerate to assign incremental label IDs
parts = line.strip().split(' ') # Split by space
if parts and parts[0]: # Guard against empty lines
label_dict[parts[0]] = idx
event_list = [] # Accumulator for event arrays across chunks
ukb_iterator = pd.read_csv( # Stream UK Biobank data in chunks
"../ukb_data.csv",
sep=',',
chunksize=10000, # Stream file in manageable chunks to reduce memory footprint
# First column (participant ID) becomes DataFrame index
index_col=0,
low_memory=False # Disable type inference optimization for consistent dtypes
)
# Iterate chunks with progress
for ukb_chunk in tqdm.tqdm(ukb_iterator, desc="Processing UK Biobank data"):
# Rename columns to friendly names
ukb_chunk = ukb_chunk.rename(columns=field_dict)
# Require sex to be present
ukb_chunk.dropna(subset=['sex'], inplace=True)
    ukb_chunk['sex'] += 2  # Recode sex tokens: 0 -> 2, 1 -> 3
# Construct date of birth from year and month (day fixed to 1)
ukb_chunk['dob'] = pd.to_datetime(
# Guard against malformed dates
ukb_chunk[['year', 'month']].assign(DAY=1), errors='coerce'
)
# Use only date variables that actually exist in the current chunk
present_date_vars = [c for c in date_vars if c in ukb_chunk.columns]
# Convert date-like columns to datetime and compute day offsets from dob
if present_date_vars:
date_cols = ukb_chunk[present_date_vars].apply(
pd.to_datetime, format="%Y-%m-%d", errors='coerce' # Parse dates safely
)
date_cols_days = date_cols.sub(
ukb_chunk['dob'], axis=0) # Timedelta relative to dob
ukb_chunk[present_date_vars] = date_cols_days.apply(
lambda x: x.dt.days) # Store days since dob
    # Process disease events: the ICD10-related date columns plus 'Death'
    icd10_cols = [c for c in date_vars[:len_icd] + ['Death']
                  if c in ukb_chunk.columns]
# Melt to long form: participant id, event code (column name), and days offset
melted_df = ukb_chunk.reset_index().melt(
id_vars=['eid'],
value_vars=icd10_cols,
var_name='event_code',
value_name='days',
)
# Require non-missing day offsets
melted_df.dropna(subset=['days'], inplace=True)
if not melted_df.empty:
melted_df['label'] = melted_df['event_code'].map(
label_dict) # Map event code to numeric label
        # Drop events whose code has no label mapping before the int cast
        melted_df.dropna(subset=['label'], inplace=True)
if not melted_df.empty:
event_list.append(
melted_df[['eid', 'days', 'label']]
.astype(int) # Safe now since label and days are non-null
.to_numpy()
)
df_res = ukb_chunk.reset_index() # Bring participant ID out of index
# Simplify stub names for wide_to_long
# Rename date stubs
rename_dict = {f'cancer_date_{j}': f'cancerdate{j}' for j in range(17)}
rename_dict.update(
# Rename type stubs
{f'cancer_type_{j}': f'cancertype{j}' for j in range(17)})
df_renamed = df_res.rename(columns=rename_dict) # Apply renaming
stubs_to_use = [] # Collect available stubs
if any('cancerdate' in col for col in df_renamed.columns):
stubs_to_use.append('cancerdate') # Date stub present
if any('cancertype' in col for col in df_renamed.columns):
stubs_to_use.append('cancertype') # Type stub present
if len(stubs_to_use) == 2: # Only proceed if both date and type columns exist
        long_cancer = pd.wide_to_long(
            df_renamed,
            stubnames=stubs_to_use,
            i=['eid'],  # Participant ID identifier
            j='cancer_num'  # Cancer record number (0..16)
        ).dropna(subset=stubs_to_use)  # Drop only rows missing the date or the type
if not long_cancer.empty:
long_cancer['cancer'] = long_cancer['cancertype'].str.slice(
0, 3) # Use first 3 chars as code
long_cancer['cancer_label'] = long_cancer['cancer'].map(
label_dict) # Map to label id
cancer_array = (
long_cancer.reset_index(
)[['eid', 'cancerdate', 'cancer_label']]
.dropna()
.astype(int)
.to_numpy()
)
if cancer_array.size > 0:
event_list.append(cancer_array) # Append cancer events
# Process BMI, smoking, and alcohol status
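    # Each lifestyle measure becomes a discrete event token stamped with the
    # assessment date, sharing the (eid, days, label) stream with diagnoses:
    # BMI bands -> 4 (bmi <= 22), 5 (22 < bmi <= 28), 6 (bmi > 28);
    # smoking responses -> 7-9; alcohol frequency -> 10-12.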
ukb_bmi = ukb_chunk[['date_of_assessment', 'bmi']].dropna().reset_index()
if not ukb_bmi.empty:
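        # np.select picks the first matching condition, so the stricter
        # 'bmi > 28' test must come before 'bmi > 22'; anything else gets default=4.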
ukb_bmi['bmi_status'] = np.select(
[ukb_bmi['bmi'] > 28, ukb_bmi['bmi'] > 22],
[6, 5],
default=4
)
event_list.append(
ukb_bmi[['eid', 'date_of_assessment', 'bmi_status']]
.astype(int)
.to_numpy()
)
ukb_sm = ukb_chunk[['date_of_assessment', 'smoking']].dropna().reset_index()
ukb_sm = ukb_sm[ukb_sm['smoking'] != -3] # Exclude unknown smoking status
if not ukb_sm.empty:
ukb_sm['smoking_status'] = np.select(
[ukb_sm['smoking'] == 1, ukb_sm['smoking'] == 2],
[9, 8],
default=7
)
event_list.append(
ukb_sm[['eid', 'date_of_assessment', 'smoking_status']]
.astype(int)
.to_numpy()
)
ukb_al = ukb_chunk[['date_of_assessment', 'alcohol']].dropna().reset_index()
ukb_al = ukb_al[ukb_al['alcohol'] != -3] # Exclude unknown alcohol status
if not ukb_al.empty:
ukb_al['alcohol_status'] = np.select(
[ukb_al['alcohol'] == 1, ukb_al['alcohol'] < 4],
[12, 11],
default=10
)
event_list.append(
ukb_al[['eid', 'date_of_assessment', 'alcohol_status']]
.astype(int)
.to_numpy()
)
# Combine tabular chunks
data = np.vstack(event_list) # Stack all event arrays into one
# Sort by participant then day
data = data[np.lexsort((data[:, 1], data[:, 0]))]
# Keep only events with non-negative day offsets
data = data[data[:, 1] >= 0]
# Remove duplicate (participant_id, label) pairs keeping first occurrence.
data = pd.DataFrame(data).drop_duplicates([0, 2]).values
# Store compactly using unsigned 32-bit integers
data = data.astype(np.uint32)
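# Layout note: each row is an (eid, days_since_birth, label) triple of uint32;
# tofile() below writes them as raw binary in native byte order with no header.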
# Split data into train/val/test based on unique participant IDs
unique_ids = np.unique(data[:, 0]) # Unique participant IDs
train_split_id = unique_ids[int(len(unique_ids) * train_frac)]
val_split_id = unique_ids[int(len(unique_ids) * (train_frac + val_frac))]
data[data[:, 0] <= train_split_id].tofile("ukb_real_train.bin")
data[(data[:, 0] > train_split_id) &
     (data[:, 0] <= val_split_id)].tofile("ukb_real_val.bin")
data[data[:, 0] > val_split_id].tofile("ukb_real_test.bin")

field_id.txt: new file, 1237 lines (diff suppressed because it is too large)

field_ids_enriched.csv: new file, 74 lines

@@ -0,0 +1,74 @@
field_instance,full_name,var_name
31-0.0,Sex,sex
34-0.0,Year of birth,year
48-0.0,Waist circumference,waist_circumference
49-0.0,Hip circumference,hip_circumference
50-0.0,Standing height,standing_height
52-0.0,Month of birth,month
53-0.0,Date of attending assessment centre,date_of_assessment
74-0.0,Fasting time,fasting_time
102-0.0,Pulse rate automated reading,pulse_rate
1239-0.0,Current tobacco smoking,smoking
1558-0.0,Alcohol intake frequency.,alcohol
4079-0.0,Diastolic blood pressure automated reading,dbp
4080-0.0,Systolic blood pressure automated reading,sbp
20150-0.0,Forced expiratory volume in 1-second (FEV1) Best measure,fev1_best
20151-0.0,Forced vital capacity (FVC) Best measure,fvc_best
20258-0.0,FEV1/ FVC ratio Z-score,fev1_fvc_ratio
21001-0.0,Body mass index (BMI),bmi
21003-0.0,Age when attended assessment centre,age_at_assessment
30000-0.0,White blood cell (leukocyte) count,WBC
30010-0.0,Red blood cell (erythrocyte) count,RBC
30020-0.0,Haemoglobin concentration,hemoglobin
30030-0.0,Haematocrit percentage,hematocrit
30040-0.0,Mean corpuscular volume,MCV
30050-0.0,Mean corpuscular haemoglobin,MCH
30060-0.0,Mean corpuscular haemoglobin concentration,MCHC
30080-0.0,Platelet count,Pc
30100-0.0,Mean platelet (thrombocyte) volume,MPV
30120-0.0,Lymphocyte count,LymC
30130-0.0,Monocyte count,MonC
30140-0.0,Neutrophill count,NeuC
30150-0.0,Eosinophill count,EosC
30160-0.0,Basophill count,BasC
30170-0.0,Nucleated red blood cell count,nRBC
30250-0.0,Reticulocyte count,RC
30260-0.0,Mean reticulocyte volume,MRV
30270-0.0,Mean sphered cell volume,MSCV
30280-0.0,Immature reticulocyte fraction,IRF
30300-0.0,High light scatter reticulocyte count,HLSRC
30500-0.0,Microalbumin in urine,MicU
30510-0.0,Creatinine (enzymatic) in urine,CreaU
30520-0.0,Potassium in urine,PotU
30530-0.0,Sodium in urine,SodU
30600-0.0,Albumin,Alb
30610-0.0,Alkaline phosphatase,ALP
30620-0.0,Alanine aminotransferase,Alanine
30630-0.0,Apolipoprotein A,ApoA
30640-0.0,Apolipoprotein B,ApoB
30650-0.0,Aspartate aminotransferase,AA
30660-0.0,Direct bilirubin,DBil
30670-0.0,Urea,Urea
30680-0.0,Calcium,Calcium
30690-0.0,Cholesterol,Cholesterol
30700-0.0,Creatinine,Creatinine
30710-0.0,C-reactive protein,CRP
30720-0.0,Cystatin C,CystatinC
30730-0.0,Gamma glutamyltransferase,GGT
30740-0.0,Glucose,Glu
30750-0.0,Glycated haemoglobin (HbA1c),HbA1c
30760-0.0,HDL cholesterol,HDL
30770-0.0,IGF-1,IGF1
30780-0.0,LDL direct,LDL
30790-0.0,Lipoprotein A,LpA
30800-0.0,Oestradiol,Oestradiol
30810-0.0,Phosphate,Phosphate
30820-0.0,Rheumatoid factor,Rheu
30830-0.0,SHBG,SHBG
30840-0.0,Total bilirubin,TotalBil
30850-0.0,Testosterone,Testosterone
30860-0.0,Total protein,TotalProtein
30870-0.0,Triglycerides,Tri
30880-0.0,Urate,Urate
30890-0.0,Vitamin D,VitaminD
40000-0.0,Date of death,Death

icd10_codes_mod.tsv: new file, 1129 lines (diff suppressed because it is too large)

labels.csv: new file, 1257 lines (diff suppressed because it is too large)

prepare_data.R: new file, 26 lines

@@ -0,0 +1,26 @@
library(data.table)
setDTthreads(40)
library(readr)
field_id <- read.csv("field_id.txt", header = FALSE)
uid <- field_id$V1
big_path <- "/mnt/storage/shared_data/UKBB/20230518-from-zhourong/HHdata_221103_0512.csv"
header_dt <- fread(big_path, nrows = 0) # read 0 rows, i.e. column names only
all_names <- names(header_dt)
keep_names <- intersect(all_names, uid)
ukb_disease <- fread(big_path,
                     select = keep_names,
                     showProgress = TRUE)
big_path <- "/mnt/storage/shared_data/UKBB/20230518-from-zhourong/HH_data_220812_0512.csv"
header_dt <- fread(big_path, nrows = 0) # read 0 rows, i.e. column names only
all_names <- names(header_dt)
keep_names <- intersect(all_names, uid)
ukb_others <- fread(big_path,
                    select = keep_names,
                    showProgress = TRUE)
# merge disease and other data by "eid"
ukb_data <- merge(ukb_disease, ukb_others, by = "eid", all = TRUE)
fwrite(ukb_data, "ukb_data.csv")
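
For orientation, a minimal pandas sketch of the same column-subset-and-merge step; this is illustrative only (the R script above is what the pipeline runs, and the shortened file names are placeholders):

import pandas as pd

# Keep only the field IDs listed in field_id.txt from each extract,
# then full-outer-join the two tables on participant ID ("eid").
uid = set(pd.read_csv("field_id.txt", header=None)[0])

def load_subset(path):
    header = pd.read_csv(path, nrows=0)  # header row only, like fread(nrows = 0)
    keep = [c for c in header.columns if c in uid]
    return pd.read_csv(path, usecols=keep)

ukb_disease = load_subset("HHdata_221103_0512.csv")  # placeholder path
ukb_others = load_subset("HH_data_220812_0512.csv")  # placeholder path
ukb_data = ukb_disease.merge(ukb_others, on="eid", how="outer")
ukb_data.to_csv("ukb_data.csv", index=False)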

prepare_data.py: new file, 211 lines

@@ -0,0 +1,211 @@
import pandas as pd # Pandas for data manipulation
import tqdm # Progress bar for chunk processing
import numpy as np # Numerical operations
train_frac = 0.7 # Fraction of participants for training split
val_frac = 0.15 # Fraction of participants for validation split
test_frac = 0.15 # Fraction of participants for test split
# CSV mapping field IDs to human-readable names
field_map_file = "field_ids_enriched.csv"
field_dict = {} # Map original field ID -> new column name
tabular_fields = [] # List of tabular feature column names
with open(field_map_file, "r", encoding="utf-8") as f: # Open the field mapping file
next(f) # skip header line
for line in f: # Iterate through lines
parts = line.strip().split(",") # Split by CSV commas
        if len(parts) >= 3:  # Require at least the id, full-name, and var-name columns
            field_id = parts[0]  # Original field identifier (e.g., "34-0.0")
            field_name = parts[2]  # Human-readable column name
            field_dict[field_id] = field_name  # Record the mapping
            tabular_fields.append(field_name)  # Track as a potential tabular feature
# Exclude raw date parts and target columns
exclude_fields = ['year', 'month', 'Death', 'age_at_assessment']
tabular_fields = [
# Filter out excluded columns
field for field in tabular_fields if field not in exclude_fields]
# TSV mapping field IDs to ICD10-related date columns
field_to_icd_map = "icd10_codes_mod.tsv"
# Date-like variables to be converted to offsets
date_vars = []
with open(field_to_icd_map, "r", encoding="utf-8") as f: # Open ICD10 mapping
for line in f: # Iterate each mapping row
parts = line.strip().split() # Split on whitespace for TSV
if len(parts) >= 6: # Guard against malformed lines
# Map field ID to the date column name
field_dict[parts[0]] = parts[5]
date_vars.append(parts[5]) # Track date column names in order
for j in range(17): # Map up to 17 cancer entry slots (dates and types)
# Cancer diagnosis date slot j
field_dict[f'40005-{j}.0'] = f'cancer_date_{j}'
field_dict[f'40006-{j}.0'] = f'cancer_type_{j}' # Cancer type/code slot j
# Number of ICD-related date columns before adding extras
len_icd = len(date_vars)
date_vars.extend(['Death', 'date_of_assessment'] + # Add outcome date and assessment date
# Add cancer date columns
[f'cancer_date_{j}' for j in range(17)])
labels_file = "labels.csv" # File listing label codes
label_dict = {} # Map code string -> integer label id
with open(labels_file, "r", encoding="utf-8") as f: # Open labels file
for idx, line in enumerate(f): # Enumerate to assign incremental label IDs
parts = line.strip().split(' ') # Split by space
if parts and parts[0]: # Guard against empty lines
            # Offset by 2 so label IDs 0 and 1 stay reserved (padding and checkup)
            label_dict[parts[0]] = idx + 2
event_list = [] # Accumulator for event arrays across chunks
tabular_list = [] # Accumulator for tabular feature DataFrames across chunks
ukb_iterator = pd.read_csv( # Stream UK Biobank data in chunks
"ukb_data.csv",
sep=',',
chunksize=10000, # Stream file in manageable chunks to reduce memory footprint
# First column (participant ID) becomes DataFrame index
index_col=0,
low_memory=False # Disable type inference optimization for consistent dtypes
)
# Iterate chunks with progress
for ukb_chunk in tqdm.tqdm(ukb_iterator, desc="Processing UK Biobank data"):
# Rename columns to friendly names
ukb_chunk = ukb_chunk.rename(columns=field_dict)
# Require sex to be present
ukb_chunk.dropna(subset=['sex'], inplace=True)
# Construct date of birth from year and month (day fixed to 1)
ukb_chunk['dob'] = pd.to_datetime(
# Guard against malformed dates
ukb_chunk[['year', 'month']].assign(DAY=1), errors='coerce'
)
# Use only date variables that actually exist in the current chunk
present_date_vars = [c for c in date_vars if c in ukb_chunk.columns]
# Convert date-like columns to datetime and compute day offsets from dob
if present_date_vars:
date_cols = ukb_chunk[present_date_vars].apply(
pd.to_datetime, format="%Y-%m-%d", errors='coerce' # Parse dates safely
)
date_cols_days = date_cols.sub(
ukb_chunk['dob'], axis=0) # Timedelta relative to dob
ukb_chunk[present_date_vars] = date_cols_days.apply(
lambda x: x.dt.days) # Store days since dob
ukb_chunk = ukb_chunk.convert_dtypes()
# Append tabular features (use only columns that exist)
present_tabular_fields = [
c for c in tabular_fields if c in ukb_chunk.columns]
tabular_list.append(ukb_chunk[present_tabular_fields].copy())
    # Process disease events: the ICD10-related date columns plus 'Death'
    icd10_cols = [c for c in date_vars[:len_icd] + ['Death']
                  if c in ukb_chunk.columns]
# Melt to long form: participant id, event code (column name), and days offset
melted_df = ukb_chunk.reset_index().melt(
id_vars=['eid'],
value_vars=icd10_cols,
var_name='event_code',
value_name='days',
)
# Require non-missing day offsets
melted_df.dropna(subset=['days'], inplace=True)
if not melted_df.empty:
melted_df['label'] = melted_df['event_code'].map(
label_dict) # Map event code to numeric label
        # Drop events whose code has no label mapping before the int cast
        melted_df.dropna(subset=['label'], inplace=True)
if not melted_df.empty:
event_list.append(
melted_df[['eid', 'days', 'label']]
.astype(int) # Safe now since label and days are non-null
.to_numpy()
)
    # Add assessment date as a "checkup" event (label=1)
if 'date_of_assessment' in ukb_chunk.columns:
assessment_array = (
ukb_chunk.reset_index()[['eid', 'date_of_assessment']]
.dropna()
.assign(label=1) # Checkup label
.astype(int)
.to_numpy()
)
if assessment_array.size > 0:
event_list.append(assessment_array) # Append checkup events
df_res = ukb_chunk.reset_index() # Bring participant ID out of index
# Simplify stub names for wide_to_long
# Rename date stubs
rename_dict = {f'cancer_date_{j}': f'cancerdate{j}' for j in range(17)}
rename_dict.update(
# Rename type stubs
{f'cancer_type_{j}': f'cancertype{j}' for j in range(17)})
df_renamed = df_res.rename(columns=rename_dict) # Apply renaming
stubs_to_use = [] # Collect available stubs
if any('cancerdate' in col for col in df_renamed.columns):
stubs_to_use.append('cancerdate') # Date stub present
if any('cancertype' in col for col in df_renamed.columns):
stubs_to_use.append('cancertype') # Type stub present
if len(stubs_to_use) == 2: # Only proceed if both date and type columns exist
        long_cancer = pd.wide_to_long(
            df_renamed,
            stubnames=stubs_to_use,
            i=['eid'],  # Participant ID identifier
            j='cancer_num'  # Cancer record number (0..16)
        ).dropna(subset=stubs_to_use)  # Drop only rows missing the date or the type
if not long_cancer.empty:
long_cancer['cancer'] = long_cancer['cancertype'].str.slice(
0, 3) # Use first 3 chars as code
long_cancer['cancer_label'] = long_cancer['cancer'].map(
label_dict) # Map to label id
cancer_array = (
long_cancer.reset_index(
)[['eid', 'cancerdate', 'cancer_label']]
.dropna()
.astype(int)
.to_numpy()
)
if cancer_array.size > 0:
event_list.append(cancer_array) # Append cancer events
# Combine tabular chunks
final_tabular = pd.concat(tabular_list, axis=0, ignore_index=False)
final_tabular.index.name = 'eid' # Ensure index named consistently
data = np.vstack(event_list) # Stack all event arrays into one
# Sort by participant then day
data = data[np.lexsort((data[:, 1], data[:, 0]))]
# Keep only events with non-negative day offsets
data = data[data[:, 1] >= 0]
# Remove duplicate (participant_id, label) pairs keeping first occurrence.
data = pd.DataFrame(data).drop_duplicates([0, 2]).values
# Store compactly using unsigned 32-bit integers
data = data.astype(np.uint32)
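# Layout note: each row is an (eid, days_since_birth, label) triple of uint32;
# tofile() below writes them as raw binary in native byte order with no header.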
# Split data into train/val/test sets by participant ID
unique_ids = np.unique(data[:, 0]) # Unique participant IDs
# ID cutoff for train
train_split_id = unique_ids[int(len(unique_ids) * train_frac)]
# ID cutoff for val
val_split_id = unique_ids[int(len(unique_ids) * (train_frac + val_frac))]
data[data[:, 0] <= train_split_id].tofile("ukb_train.bin")
data[(data[:, 0] > train_split_id) &
     (data[:, 0] <= val_split_id)].tofile("ukb_val.bin")
data[data[:, 0] > val_split_id].tofile("ukb_test.bin")
train_tabular = final_tabular[final_tabular.index <= train_split_id]
val_tabular = final_tabular[(final_tabular.index > train_split_id) & (
final_tabular.index <= val_split_id)]
test_tabular = final_tabular[final_tabular.index > val_split_id]
train_tabular.to_csv("ukb_train_tabular.csv")
val_tabular.to_csv("ukb_val_tabular.csv")
test_tabular.to_csv("ukb_test_tabular.csv")
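
Because the .bin outputs are headerless raw binary, a reader has to re-impose the record layout itself. A minimal sketch of loading one split back, assuming only what the script above writes:

import numpy as np

# Restore the (eid, days_since_birth, label) triples written by tofile();
# the dtype and column count must match the writer exactly, since the
# file carries no shape or dtype metadata.
events = np.fromfile("ukb_train.bin", dtype=np.uint32).reshape(-1, 3)
eids, days, labels = events[:, 0], events[:, 1], events[:, 2]
print(events.shape)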