Add data preparation scripts for UK Biobank analysis

- Introduced `prepare_data.R` for merging disease and other data from CSV files.
- Added `prepare_data.py` for processing UK Biobank data, including:
  - Mapping field IDs to human-readable names.
  - Handling date variables and converting them to offsets.
  - Processing disease events and constructing tabular features.
  - Splitting data into training, validation, and test sets.
  - Saving processed data to binary and CSV formats.
Commit 9ca8909e3a (parent d48c62466f)
Date: 2025-12-04 11:26:49 +08:00
8 changed files with 5420 additions and 0 deletions

delphi_fork/labels.csv: new file, 1270 lines (diff suppressed because it is too large)

delphi_fork/prepare_data.py: new file, 216 lines

@@ -0,0 +1,216 @@
import pandas as pd # Pandas for data manipulation
import tqdm # Progress bar for chunk processing
import numpy as np # Numerical operations
train_frac = 0.7 # Fraction of participants for training split
val_frac = 0.15 # Fraction of participants for validation split
test_frac = 0.15 # Fraction of participants for test split
# CSV mapping field IDs to human-readable names
field_map_file = "../field_ids_enriched.csv"
field_dict = {} # Map original field ID -> new column name
with open(field_map_file, "r", encoding="utf-8") as f: # Open the field mapping file
next(f) # skip header line
for line in f: # Iterate through lines
parts = line.strip().split(",") # Split by CSV commas
        if len(parts) >= 3:  # Require at least the id, full-name, and var-name columns
            field_id = parts[0]  # Original field identifier (e.g., "34-0.0")
            field_name = parts[2]  # Human-readable column name
            field_dict[field_id] = field_name  # Record the mapping
# TSV mapping field IDs to ICD10-related date columns
field_to_icd_map = "../icd10_codes_mod.tsv"
# Date-like variables to be converted to offsets
date_vars = []
with open(field_to_icd_map, "r", encoding="utf-8") as f: # Open ICD10 mapping
for line in f: # Iterate each mapping row
parts = line.strip().split() # Split on whitespace for TSV
if len(parts) >= 6: # Guard against malformed lines
# Map field ID to the date column name
field_dict[parts[0]] = parts[5]
date_vars.append(parts[5]) # Track date column names in order
for j in range(17): # Map up to 17 cancer entry slots (dates and types)
# Cancer diagnosis date slot j
field_dict[f'40005-{j}.0'] = f'cancer_date_{j}'
field_dict[f'40006-{j}.0'] = f'cancer_type_{j}' # Cancer type/code slot j
# Number of ICD-related date columns before adding extras
len_icd = len(date_vars)
date_vars.extend(['Death', 'date_of_assessment'] + # Add outcome date and assessment date
# Add cancer date columns
[f'cancer_date_{j}' for j in range(17)])
labels_file = "labels.csv" # File listing label codes
label_dict = {} # Map code string -> integer label id
with open(labels_file, "r", encoding="utf-8") as f: # Open labels file
for idx, line in enumerate(f): # Enumerate to assign incremental label IDs
parts = line.strip().split(' ') # Split by space
if parts and parts[0]: # Guard against empty lines
label_dict[parts[0]] = idx
event_list = [] # Accumulator for event arrays across chunks
ukb_iterator = pd.read_csv( # Stream UK Biobank data in chunks
"../ukb_data.csv",
sep=',',
chunksize=10000, # Stream file in manageable chunks to reduce memory footprint
# First column (participant ID) becomes DataFrame index
index_col=0,
low_memory=False # Disable type inference optimization for consistent dtypes
)
# Iterate chunks with progress
for ukb_chunk in tqdm.tqdm(ukb_iterator, desc="Processing UK Biobank data"):
# Rename columns to friendly names
ukb_chunk = ukb_chunk.rename(columns=field_dict)
# Require sex to be present
ukb_chunk.dropna(subset=['sex'], inplace=True)
    ukb_chunk['sex'] += 2  # Recode sex tokens: 0 -> 2, 1 -> 3
# Construct date of birth from year and month (day fixed to 1)
ukb_chunk['dob'] = pd.to_datetime(
# Guard against malformed dates
ukb_chunk[['year', 'month']].assign(DAY=1), errors='coerce'
)
# Use only date variables that actually exist in the current chunk
present_date_vars = [c for c in date_vars if c in ukb_chunk.columns]
# Convert date-like columns to datetime and compute day offsets from dob
if present_date_vars:
date_cols = ukb_chunk[present_date_vars].apply(
pd.to_datetime, format="%Y-%m-%d", errors='coerce' # Parse dates safely
)
date_cols_days = date_cols.sub(
ukb_chunk['dob'], axis=0) # Timedelta relative to dob
ukb_chunk[present_date_vars] = date_cols_days.apply(
lambda x: x.dt.days) # Store days since dob
    # Process disease events: the ICD10-related date columns plus 'Death'
    icd10_cols = [c for c in date_vars[:len_icd] + ['Death']
                  if c in ukb_chunk.columns]
# Melt to long form: participant id, event code (column name), and days offset
melted_df = ukb_chunk.reset_index().melt(
id_vars=['eid'],
value_vars=icd10_cols,
var_name='event_code',
value_name='days',
)
# Require non-missing day offsets
melted_df.dropna(subset=['days'], inplace=True)
if not melted_df.empty:
melted_df['label'] = melted_df['event_code'].map(
label_dict) # Map event code to numeric label
        # Drop events whose code has no label mapping before the int cast
        melted_df.dropna(subset=['label'], inplace=True)
if not melted_df.empty:
event_list.append(
melted_df[['eid', 'days', 'label']]
.astype(int) # Safe now since label and days are non-null
.to_numpy()
)
df_res = ukb_chunk.reset_index() # Bring participant ID out of index
# Simplify stub names for wide_to_long
# Rename date stubs
rename_dict = {f'cancer_date_{j}': f'cancerdate{j}' for j in range(17)}
rename_dict.update(
# Rename type stubs
{f'cancer_type_{j}': f'cancertype{j}' for j in range(17)})
df_renamed = df_res.rename(columns=rename_dict) # Apply renaming
stubs_to_use = [] # Collect available stubs
if any('cancerdate' in col for col in df_renamed.columns):
stubs_to_use.append('cancerdate') # Date stub present
if any('cancertype' in col for col in df_renamed.columns):
stubs_to_use.append('cancertype') # Type stub present
if len(stubs_to_use) == 2: # Only proceed if both date and type columns exist
        long_cancer = pd.wide_to_long(
            df_renamed,
            stubnames=stubs_to_use,
            i=['eid'],  # Participant ID identifier
            j='cancer_num'  # Cancer record number (0..16)
        ).dropna(subset=stubs_to_use)  # Drop only rows missing the date or the type
if not long_cancer.empty:
long_cancer['cancer'] = long_cancer['cancertype'].str.slice(
0, 3) # Use first 3 chars as code
long_cancer['cancer_label'] = long_cancer['cancer'].map(
label_dict) # Map to label id
cancer_array = (
long_cancer.reset_index(
)[['eid', 'cancerdate', 'cancer_label']]
.dropna()
.astype(int)
.to_numpy()
)
if cancer_array.size > 0:
event_list.append(cancer_array) # Append cancer events
# Process BMI, smoking, and alcohol status
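    # Each lifestyle measure becomes a discrete event token stamped with the
    # assessment date, sharing the (eid, days, label) stream with diagnoses:
    # BMI bands -> 4 (bmi <= 22), 5 (22 < bmi <= 28), 6 (bmi > 28);
    # smoking responses -> 7-9; alcohol frequency -> 10-12.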
ukb_bmi = ukb_chunk[['date_of_assessment', 'bmi']].dropna().reset_index()
if not ukb_bmi.empty:
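        # np.select picks the first matching condition, so the stricter
        # 'bmi > 28' test must come before 'bmi > 22'; anything else gets default=4.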
ukb_bmi['bmi_status'] = np.select(
[ukb_bmi['bmi'] > 28, ukb_bmi['bmi'] > 22],
[6, 5],
default=4
)
event_list.append(
ukb_bmi[['eid', 'date_of_assessment', 'bmi_status']]
.astype(int)
.to_numpy()
)
ukb_sm = ukb_chunk[['date_of_assessment', 'smoking']].dropna().reset_index()
ukb_sm = ukb_sm[ukb_sm['smoking'] != -3] # Exclude unknown smoking status
if not ukb_sm.empty:
ukb_sm['smoking_status'] = np.select(
[ukb_sm['smoking'] == 1, ukb_sm['smoking'] == 2],
[9, 8],
default=7
)
event_list.append(
ukb_sm[['eid', 'date_of_assessment', 'smoking_status']]
.astype(int)
.to_numpy()
)
ukb_al = ukb_chunk[['date_of_assessment', 'alcohol']].dropna().reset_index()
ukb_al = ukb_al[ukb_al['alcohol'] != -3] # Exclude unknown alcohol status
if not ukb_al.empty:
ukb_al['alcohol_status'] = np.select(
[ukb_al['alcohol'] == 1, ukb_al['alcohol'] < 4],
[12, 11],
default=10
)
event_list.append(
ukb_al[['eid', 'date_of_assessment', 'alcohol_status']]
.astype(int)
.to_numpy()
)
# Combine tabular chunks
data = np.vstack(event_list) # Stack all event arrays into one
# Sort by participant then day
data = data[np.lexsort((data[:, 1], data[:, 0]))]
# Keep only events with non-negative day offsets
data = data[data[:, 1] >= 0]
# Remove duplicate (participant_id, label) pairs keeping first occurrence.
data = pd.DataFrame(data).drop_duplicates([0, 2]).values
# Store compactly using unsigned 32-bit integers
data = data.astype(np.uint32)
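# Layout note: each row is an (eid, days_since_birth, label) triple of uint32;
# tofile() below writes them as raw binary in native byte order with no header.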
# Split data into train/val/test based on unique participant IDs
unique_ids = np.unique(data[:, 0]) # Unique participant IDs
train_split_id = unique_ids[int(len(unique_ids) * train_frac)]
val_split_id = unique_ids[int(len(unique_ids) * (train_frac + val_frac))]
data[data[:, 0] <= train_split_id].tofile("ukb_real_train.bin")
data[(data[:, 0] > train_split_id) &
     (data[:, 0] <= val_split_id)].tofile("ukb_real_val.bin")
data[data[:, 0] > val_split_id].tofile("ukb_real_test.bin")

field_id.txt: new file, 1237 lines (diff suppressed because it is too large)

field_ids_enriched.csv: new file, 74 lines

@@ -0,0 +1,74 @@
field_instance,full_name,var_name
31-0.0,Sex,sex
34-0.0,Year of birth,year
48-0.0,Waist circumference,waist_circumference
49-0.0,Hip circumference,hip_circumference
50-0.0,Standing height,standing_height
52-0.0,Month of birth,month
53-0.0,Date of attending assessment centre,date_of_assessment
74-0.0,Fasting time,fasting_time
102-0.0,Pulse rate automated reading,pulse_rate
1239-0.0,Current tobacco smoking,smoking
1558-0.0,Alcohol intake frequency.,alcohol
4079-0.0,Diastolic blood pressure automated reading,dbp
4080-0.0,Systolic blood pressure automated reading,sbp
20150-0.0,Forced expiratory volume in 1-second (FEV1) Best measure,fev1_best
20151-0.0,Forced vital capacity (FVC) Best measure,fvc_best
20258-0.0,FEV1/ FVC ratio Z-score,fev1_fvc_ratio
21001-0.0,Body mass index (BMI),bmi
21003-0.0,Age when attended assessment centre,age_at_assessment
30000-0.0,White blood cell (leukocyte) count,WBC
30010-0.0,Red blood cell (erythrocyte) count,RBC
30020-0.0,Haemoglobin concentration,hemoglobin
30030-0.0,Haematocrit percentage,hematocrit
30040-0.0,Mean corpuscular volume,MCV
30050-0.0,Mean corpuscular haemoglobin,MCH
30060-0.0,Mean corpuscular haemoglobin concentration,MCHC
30080-0.0,Platelet count,Pc
30100-0.0,Mean platelet (thrombocyte) volume,MPV
30120-0.0,Lymphocyte count,LymC
30130-0.0,Monocyte count,MonC
30140-0.0,Neutrophill count,NeuC
30150-0.0,Eosinophill count,EosC
30160-0.0,Basophill count,BasC
30170-0.0,Nucleated red blood cell count,nRBC
30250-0.0,Reticulocyte count,RC
30260-0.0,Mean reticulocyte volume,MRV
30270-0.0,Mean sphered cell volume,MSCV
30280-0.0,Immature reticulocyte fraction,IRF
30300-0.0,High light scatter reticulocyte count,HLSRC
30500-0.0,Microalbumin in urine,MicU
30510-0.0,Creatinine (enzymatic) in urine,CreaU
30520-0.0,Potassium in urine,PotU
30530-0.0,Sodium in urine,SodU
30600-0.0,Albumin,Alb
30610-0.0,Alkaline phosphatase,ALP
30620-0.0,Alanine aminotransferase,Alanine
30630-0.0,Apolipoprotein A,ApoA
30640-0.0,Apolipoprotein B,ApoB
30650-0.0,Aspartate aminotransferase,AA
30660-0.0,Direct bilirubin,DBil
30670-0.0,Urea,Urea
30680-0.0,Calcium,Calcium
30690-0.0,Cholesterol,Cholesterol
30700-0.0,Creatinine,Creatinine
30710-0.0,C-reactive protein,CRP
30720-0.0,Cystatin C,CystatinC
30730-0.0,Gamma glutamyltransferase,GGT
30740-0.0,Glucose,Glu
30750-0.0,Glycated haemoglobin (HbA1c),HbA1c
30760-0.0,HDL cholesterol,HDL
30770-0.0,IGF-1,IGF1
30780-0.0,LDL direct,LDL
30790-0.0,Lipoprotein A,LpA
30800-0.0,Oestradiol,Oestradiol
30810-0.0,Phosphate,Phosphate
30820-0.0,Rheumatoid factor,Rheu
30830-0.0,SHBG,SHBG
30840-0.0,Total bilirubin,TotalBil
30850-0.0,Testosterone,Testosterone
30860-0.0,Total protein,TotalProtein
30870-0.0,Triglycerides,Tri
30880-0.0,Urate,Urate
30890-0.0,Vitamin D,VitaminD
40000-0.0,Date of death,Death

icd10_codes_mod.tsv: new file, 1129 lines (diff suppressed because it is too large)

labels.csv: new file, 1257 lines (diff suppressed because it is too large)

prepare_data.R: new file, 26 lines

@@ -0,0 +1,26 @@
library(data.table)
setDTthreads(40)
library(readr)
field_id <- read.csv("field_id.txt", header = FALSE)
uid <- field_id$V1
big_path <- "/mnt/storage/shared_data/UKBB/20230518-from-zhourong/HHdata_221103_0512.csv"
header_dt <- fread(big_path, nrows = 0) # read 0 rows, i.e. column names only
all_names <- names(header_dt)
keep_names <- intersect(all_names, uid)
ukb_disease <- fread(big_path,
                     select = keep_names,
                     showProgress = TRUE)
big_path <- "/mnt/storage/shared_data/UKBB/20230518-from-zhourong/HH_data_220812_0512.csv"
header_dt <- fread(big_path, nrows = 0) # read 0 rows, i.e. column names only
all_names <- names(header_dt)
keep_names <- intersect(all_names, uid)
ukb_others <- fread(big_path,
                    select = keep_names,
                    showProgress = TRUE)
# merge disease and other data by "eid"
ukb_data <- merge(ukb_disease, ukb_others, by = "eid", all = TRUE)
fwrite(ukb_data, "ukb_data.csv")
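
For orientation, a minimal pandas sketch of the same column-subset-and-merge step; this is illustrative only (the R script above is what the pipeline runs, and the shortened file names are placeholders):

import pandas as pd

# Keep only the field IDs listed in field_id.txt from each extract,
# then full-outer-join the two tables on participant ID ("eid").
uid = set(pd.read_csv("field_id.txt", header=None)[0])

def load_subset(path):
    header = pd.read_csv(path, nrows=0)  # header row only, like fread(nrows = 0)
    keep = [c for c in header.columns if c in uid]
    return pd.read_csv(path, usecols=keep)

ukb_disease = load_subset("HHdata_221103_0512.csv")  # placeholder path
ukb_others = load_subset("HH_data_220812_0512.csv")  # placeholder path
ukb_data = ukb_disease.merge(ukb_others, on="eid", how="outer")
ukb_data.to_csv("ukb_data.csv", index=False)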

prepare_data.py: new file, 211 lines

@@ -0,0 +1,211 @@
import pandas as pd # Pandas for data manipulation
import tqdm # Progress bar for chunk processing
import numpy as np # Numerical operations
train_frac = 0.7 # Fraction of participants for training split
val_frac = 0.15 # Fraction of participants for validation split
test_frac = 0.15 # Fraction of participants for test split
# CSV mapping field IDs to human-readable names
field_map_file = "field_ids_enriched.csv"
field_dict = {} # Map original field ID -> new column name
tabular_fields = [] # List of tabular feature column names
with open(field_map_file, "r", encoding="utf-8") as f: # Open the field mapping file
next(f) # skip header line
for line in f: # Iterate through lines
parts = line.strip().split(",") # Split by CSV commas
        if len(parts) >= 3:  # Require at least the id, full-name, and var-name columns
            field_id = parts[0]  # Original field identifier (e.g., "34-0.0")
            field_name = parts[2]  # Human-readable column name
            field_dict[field_id] = field_name  # Record the mapping
            tabular_fields.append(field_name)  # Track as a potential tabular feature
# Exclude raw date parts and target columns
exclude_fields = ['year', 'month', 'Death', 'age_at_assessment']
tabular_fields = [
# Filter out excluded columns
field for field in tabular_fields if field not in exclude_fields]
# TSV mapping field IDs to ICD10-related date columns
field_to_icd_map = "icd10_codes_mod.tsv"
# Date-like variables to be converted to offsets
date_vars = []
with open(field_to_icd_map, "r", encoding="utf-8") as f: # Open ICD10 mapping
for line in f: # Iterate each mapping row
parts = line.strip().split() # Split on whitespace for TSV
if len(parts) >= 6: # Guard against malformed lines
# Map field ID to the date column name
field_dict[parts[0]] = parts[5]
date_vars.append(parts[5]) # Track date column names in order
for j in range(17): # Map up to 17 cancer entry slots (dates and types)
# Cancer diagnosis date slot j
field_dict[f'40005-{j}.0'] = f'cancer_date_{j}'
field_dict[f'40006-{j}.0'] = f'cancer_type_{j}' # Cancer type/code slot j
# Number of ICD-related date columns before adding extras
len_icd = len(date_vars)
date_vars.extend(['Death', 'date_of_assessment'] + # Add outcome date and assessment date
# Add cancer date columns
[f'cancer_date_{j}' for j in range(17)])
labels_file = "labels.csv" # File listing label codes
label_dict = {} # Map code string -> integer label id
with open(labels_file, "r", encoding="utf-8") as f: # Open labels file
for idx, line in enumerate(f): # Enumerate to assign incremental label IDs
parts = line.strip().split(' ') # Split by space
if parts and parts[0]: # Guard against empty lines
            # Offset by 2 so label IDs 0 and 1 stay reserved (padding and checkup)
            label_dict[parts[0]] = idx + 2
event_list = [] # Accumulator for event arrays across chunks
tabular_list = [] # Accumulator for tabular feature DataFrames across chunks
ukb_iterator = pd.read_csv( # Stream UK Biobank data in chunks
"ukb_data.csv",
sep=',',
chunksize=10000, # Stream file in manageable chunks to reduce memory footprint
# First column (participant ID) becomes DataFrame index
index_col=0,
low_memory=False # Disable type inference optimization for consistent dtypes
)
# Iterate chunks with progress
for ukb_chunk in tqdm.tqdm(ukb_iterator, desc="Processing UK Biobank data"):
# Rename columns to friendly names
ukb_chunk = ukb_chunk.rename(columns=field_dict)
# Require sex to be present
ukb_chunk.dropna(subset=['sex'], inplace=True)
# Construct date of birth from year and month (day fixed to 1)
ukb_chunk['dob'] = pd.to_datetime(
# Guard against malformed dates
ukb_chunk[['year', 'month']].assign(DAY=1), errors='coerce'
)
# Use only date variables that actually exist in the current chunk
present_date_vars = [c for c in date_vars if c in ukb_chunk.columns]
# Convert date-like columns to datetime and compute day offsets from dob
if present_date_vars:
date_cols = ukb_chunk[present_date_vars].apply(
pd.to_datetime, format="%Y-%m-%d", errors='coerce' # Parse dates safely
)
date_cols_days = date_cols.sub(
ukb_chunk['dob'], axis=0) # Timedelta relative to dob
ukb_chunk[present_date_vars] = date_cols_days.apply(
lambda x: x.dt.days) # Store days since dob
ukb_chunk = ukb_chunk.convert_dtypes()
# Append tabular features (use only columns that exist)
present_tabular_fields = [
c for c in tabular_fields if c in ukb_chunk.columns]
tabular_list.append(ukb_chunk[present_tabular_fields].copy())
    # Process disease events: the ICD10-related date columns plus 'Death'
    icd10_cols = [c for c in date_vars[:len_icd] + ['Death']
                  if c in ukb_chunk.columns]
# Melt to long form: participant id, event code (column name), and days offset
melted_df = ukb_chunk.reset_index().melt(
id_vars=['eid'],
value_vars=icd10_cols,
var_name='event_code',
value_name='days',
)
# Require non-missing day offsets
melted_df.dropna(subset=['days'], inplace=True)
if not melted_df.empty:
melted_df['label'] = melted_df['event_code'].map(
label_dict) # Map event code to numeric label
        # Drop events whose code has no label mapping before the int cast
        melted_df.dropna(subset=['label'], inplace=True)
if not melted_df.empty:
event_list.append(
melted_df[['eid', 'days', 'label']]
.astype(int) # Safe now since label and days are non-null
.to_numpy()
)
    # Add assessment date as a "checkup" event (label=1)
if 'date_of_assessment' in ukb_chunk.columns:
assessment_array = (
ukb_chunk.reset_index()[['eid', 'date_of_assessment']]
.dropna()
.assign(label=1) # Checkup label
.astype(int)
.to_numpy()
)
if assessment_array.size > 0:
event_list.append(assessment_array) # Append checkup events
df_res = ukb_chunk.reset_index() # Bring participant ID out of index
# Simplify stub names for wide_to_long
# Rename date stubs
rename_dict = {f'cancer_date_{j}': f'cancerdate{j}' for j in range(17)}
rename_dict.update(
# Rename type stubs
{f'cancer_type_{j}': f'cancertype{j}' for j in range(17)})
df_renamed = df_res.rename(columns=rename_dict) # Apply renaming
stubs_to_use = [] # Collect available stubs
if any('cancerdate' in col for col in df_renamed.columns):
stubs_to_use.append('cancerdate') # Date stub present
if any('cancertype' in col for col in df_renamed.columns):
stubs_to_use.append('cancertype') # Type stub present
if len(stubs_to_use) == 2: # Only proceed if both date and type columns exist
        long_cancer = pd.wide_to_long(
            df_renamed,
            stubnames=stubs_to_use,
            i=['eid'],  # Participant ID identifier
            j='cancer_num'  # Cancer record number (0..16)
        ).dropna(subset=stubs_to_use)  # Drop only rows missing the date or the type
if not long_cancer.empty:
long_cancer['cancer'] = long_cancer['cancertype'].str.slice(
0, 3) # Use first 3 chars as code
long_cancer['cancer_label'] = long_cancer['cancer'].map(
label_dict) # Map to label id
cancer_array = (
long_cancer.reset_index(
)[['eid', 'cancerdate', 'cancer_label']]
.dropna()
.astype(int)
.to_numpy()
)
if cancer_array.size > 0:
event_list.append(cancer_array) # Append cancer events
# Combine tabular chunks
final_tabular = pd.concat(tabular_list, axis=0, ignore_index=False)
final_tabular.index.name = 'eid' # Ensure index named consistently
data = np.vstack(event_list) # Stack all event arrays into one
# Sort by participant then day
data = data[np.lexsort((data[:, 1], data[:, 0]))]
# Keep only events with non-negative day offsets
data = data[data[:, 1] >= 0]
# Remove duplicate (participant_id, label) pairs keeping first occurrence.
data = pd.DataFrame(data).drop_duplicates([0, 2]).values
# Store compactly using unsigned 32-bit integers
data = data.astype(np.uint32)
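# Layout note: each row is an (eid, days_since_birth, label) triple of uint32;
# tofile() below writes them as raw binary in native byte order with no header.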
# Split data into train/val/test sets by participant ID
unique_ids = np.unique(data[:, 0]) # Unique participant IDs
# ID cutoff for train
train_split_id = unique_ids[int(len(unique_ids) * train_frac)]
# ID cutoff for val
val_split_id = unique_ids[int(len(unique_ids) * (train_frac + val_frac))]
data[data[:, 0] <= train_split_id].tofile("ukb_train.bin")
data[(data[:, 0] > train_split_id) &
     (data[:, 0] <= val_split_id)].tofile("ukb_val.bin")
data[data[:, 0] > val_split_id].tofile("ukb_test.bin")
train_tabular = final_tabular[final_tabular.index <= train_split_id]
val_tabular = final_tabular[(final_tabular.index > train_split_id) & (
final_tabular.index <= val_split_id)]
test_tabular = final_tabular[final_tabular.index > val_split_id]
train_tabular.to_csv("ukb_train_tabular.csv")
val_tabular.to_csv("ukb_val_tabular.csv")
test_tabular.to_csv("ukb_test_tabular.csv")
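
Because the .bin outputs are headerless raw binary, a reader has to re-impose the record layout itself. A minimal sketch of loading one split back, assuming only what the script above writes:

import numpy as np

# Restore the (eid, days_since_birth, label) triples written by tofile();
# the dtype and column count must match the writer exactly, since the
# file carries no shape or dtype metadata.
events = np.fromfile("ukb_train.bin", dtype=np.uint32).reshape(-1, 3)
eids, days, labels = events[:, 0], events[:, 1], events[:, 2]
print(events.shape)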