"""Convert a UK Biobank-style basket export into flat event files.

Inputs (paths are configured by the module-level constants below):
  * ``ukb_field_to_icd10_file`` - whitespace-separated table; column 0 is
    the basket field id, column 5 the event code that column is renamed to.
  * ``label_files`` - one label per line; the first space-separated token
    of line ``i`` (0-based) is mapped to integer label ``i``.
  * ``ukb_basket_file`` - comma-separated table indexed by ``f.eid``.

Outputs: ``<output_prefix>.bin``, ``<output_prefix>_train.bin`` and
``<output_prefix>_val.bin``.  Each is a raw ``uint32`` dump of rows
``(f.eid, days, label)`` where ``days`` is the number of days between the
first day of the participant's birth month and the event date.
"""
import pandas as pd
import tqdm
import numpy as np

label_files = 'labels.csv'
ukb_field_to_icd10_file = 'icd10_codes_mod.tsv'
ukb_basket_file = 'ukb_delphi.txt'
train_proportion = 0.8
output_prefix = 'ukb_real'

# Number of cancer-registry instances (f.40005.j.0 / f.40006.j.0) handled.
n_cancer_instances = 17

# Basket field id -> event code, plus the ordered list of event-code columns.
icdict = {}
icdcodes = []
with open(ukb_field_to_icd10_file) as f:
    for line in f:
        parts = line.strip().split()
        if not parts:
            # A blank (e.g. trailing) line carries no mapping; skip it
            # instead of failing on parts[5].
            continue
        icdict[parts[0]] = parts[5]
        icdcodes.append(parts[5])

# Label name -> integer label; the label is the 0-based line number.
label_dict = {}
with open(label_files) as f:
    for i, line in enumerate(f):
        label_dict[line.strip().split(' ')[0]] = i

# Non-ICD basket fields used below, renamed to readable column names.
icdict['f.31.0.0'] = "sex"
icdict['f.34.0.0'] = "YEAR"
icdict['f.52.0.0'] = "MONTH"
icdict['f.40000.0.0'] = "Death"
for j in range(n_cancer_instances):
    icdict[f'f.40005.{j}.0'] = f'cancer_date_{j}'
    icdict[f'f.40006.{j}.0'] = f'cancer_type_{j}'
icdict['f.53.0.0'] = "assessment_date"
icdict['f.21001.0.0'] = "BMI"
icdict['f.1239.0.0'] = "smoking"
icdict['f.1558.0.0'] = "alcohol"

# icdcodes[:len_icd] are the ICD date columns; the entries appended after
# them are further date columns that must also be converted to days.
len_icd = len(icdcodes)
icdcodes.extend(
    ['Death', 'assessment_date']
    + [f'cancer_date_{j}' for j in range(n_cancer_instances)]
)

# pd.wide_to_long needs "<stub><number>" column names, so the cancer
# columns are renamed (cancer_date_3 -> cancerdate3) before reshaping.
cancer_rename = {
    f'cancer_date_{j}': f'cancerdate{j}' for j in range(n_cancer_instances)
}
cancer_rename.update(
    {f'cancer_type_{j}': f'cancertype{j}' for j in range(n_cancer_instances)}
)

data_list = []
ukb_iterator = pd.read_csv(
    ukb_basket_file, sep=',', chunksize=10000, index_col=0, low_memory=False
)
for dd in tqdm.tqdm(ukb_iterator):
    dd = dd.rename(columns=icdict)
    dd.dropna(subset=['sex'], inplace=True)
    dd['sex'] += 1  # shift the sex code by one so it can serve as a label
    # Keep only the renamed columns.  The explicit copy makes the later
    # column assignments operate on an independent frame (no
    # SettingWithCopyWarning).
    dd = dd[[col for col in dd.columns if not col.startswith('f.')]].copy()
    # Only year and month of birth are read, so day 1 of the month is used.
    dd['dob'] = pd.to_datetime(dd[['YEAR', 'MONTH']].assign(DAY=1))

    present_icdcodes = [c for c in icdcodes if c in dd.columns]
    if present_icdcodes:
        # Convert every date column to "days since dob"; unparseable
        # dates become NaT and therefore NaN days.
        date_cols = dd[present_icdcodes].apply(
            pd.to_datetime, format="%Y-%m-%d", errors='coerce'
        )
        date_cols_days = date_cols.sub(dd['dob'], axis=0)
        dd[present_icdcodes] = date_cols_days.apply(lambda x: x.dt.days)

    # --- ICD events and death: one row per (participant, code) with a date.
    # The "+ 1" includes 'Death', the first column appended after the ICD
    # codes.
    cols_to_process = [
        col for col in icdcodes[:len_icd + 1] if col in dd.columns
    ]
    if cols_to_process:
        melted_df = dd.reset_index().melt(
            id_vars=['f.eid'],
            value_vars=cols_to_process,
            var_name='event_code',
            value_name='days',
        ).dropna(subset=['days'])
        if not melted_df.empty:
            melted_df = melted_df.assign(
                label=melted_df['event_code'].map(label_dict)
            )
            # Codes without an entry in label_dict map to NaN and are
            # dropped here.
            data_list.append(
                melted_df[['f.eid', 'days', 'label']]
                .dropna()
                .astype(int)
                .to_numpy()
            )

    # --- Sex: emitted at day 0 with label = shifted sex code.
    X = dd['sex'].reset_index().to_numpy().astype(int)
    data_list.append(
        np.c_[X[:, 0], np.zeros(X.shape[0]), X[:, 1]].astype(int)
    )

    # --- Cancer registry: pair each cancer date with its cancer type.
    present_cancer_cols = [col for col in cancer_rename if col in dd.columns]
    has_cancer_dates = any(
        col.startswith('cancer_date_') for col in present_cancer_cols
    )
    has_cancer_types = any(
        col.startswith('cancer_type_') for col in present_cancer_cols
    )
    if has_cancer_dates and has_cancer_types:
        # Reshape ONLY the cancer columns.  wide_to_long carries every
        # non-stub column into the long frame, so reshaping the full chunk
        # and then calling dropna() would discard any row with a missing
        # value in an unrelated column (other ICD dates, Death, BMI, ...),
        # i.e. practically every cancer record.
        wide_cancer = (
            dd[present_cancer_cols].reset_index().rename(columns=cancer_rename)
        )
        long_cancer = pd.wide_to_long(
            wide_cancer,
            stubnames=['cancerdate', 'cancertype'],
            i=['f.eid'],
            j='cancer_num',
        ).dropna(subset=['cancerdate', 'cancertype'])
        if not long_cancer.empty:
            cancer_events = long_cancer.reset_index()
            # Labels are looked up by the first three characters of the
            # cancer type code.
            cancer_events['cancer'] = cancer_events['cancertype'].str.slice(0, 3)
            cancer_events['cancer_label'] = cancer_events['cancer'].map(label_dict)
            cancer_array = (
                cancer_events[['f.eid', 'cancerdate', 'cancer_label']]
                .dropna()
                .astype(int)
                .to_numpy()
            )
            if cancer_array.size > 0:
                data_list.append(cancer_array)

    # --- BMI at assessment: >28 -> 5, >22 -> 4, otherwise 3.
    dd_bmi = dd[['assessment_date', 'BMI']].dropna().reset_index()
    if not dd_bmi.empty:
        dd_bmi['bmi_status'] = np.select(
            [dd_bmi['BMI'] > 28, dd_bmi['BMI'] > 22], [5, 4], default=3
        )
        data_list.append(
            dd_bmi[['f.eid', 'assessment_date', 'bmi_status']]
            .astype(int)
            .to_numpy()
        )

    # --- Smoking at assessment: 1 -> 8, 2 -> 7, otherwise 6.
    # Rows coded -3 are excluded (presumably the "prefer not to answer"
    # code - TODO confirm against the field's data coding).
    dd_sm = dd[['assessment_date', 'smoking']].dropna().reset_index()
    dd_sm = dd_sm[dd_sm['smoking'] != -3].copy()
    if not dd_sm.empty:
        dd_sm['smoking_status'] = np.select(
            [dd_sm['smoking'] == 1, dd_sm['smoking'] == 2], [8, 7], default=6
        )
        data_list.append(
            dd_sm[['f.eid', 'assessment_date', 'smoking_status']]
            .astype(int)
            .to_numpy()
        )

    # --- Alcohol at assessment: 1 -> 11, <4 -> 10, otherwise 9.
    # Rows coded -3 are excluded, as for smoking.
    dd_al = dd[['assessment_date', 'alcohol']].dropna().reset_index()
    dd_al = dd_al[dd_al['alcohol'] != -3].copy()
    if not dd_al.empty:
        dd_al['alcohol_status'] = np.select(
            [dd_al['alcohol'] == 1, dd_al['alcohol'] < 4], [11, 10], default=9
        )
        data_list.append(
            dd_al[['f.eid', 'assessment_date', 'alcohol_status']]
            .astype(int)
            .to_numpy()
        )

data = np.vstack(data_list)
# Sort by participant, then push rows carrying the highest label id to the
# end of each participant's block (presumably Death - verify against
# labels.csv), then by day.  np.lexsort treats the LAST key as primary.
data = data[np.lexsort((data[:, 1], data[:, 2] == data[:, 2].max(), data[:, 0]))]
# Events dated before the first of the birth month cannot be stored as
# unsigned values and are dropped.
data = data[data[:, 1] >= 0]
# Keep only the first row per (participant, label) in the sorted order.
data = pd.DataFrame(data).drop_duplicates([0, 2]).values
data = data.astype(np.uint32)
data.tofile(output_prefix + '.bin')

# Split by participant so nobody appears in both sets.  Exactly
# int(n_participants * train_proportion) participants (those with the
# lowest ids) go to the training set.  Slicing the id list, rather than
# indexing a threshold id, avoids both an off-by-one extra training
# participant and an IndexError when train_proportion is 1.0.
unique_ids = np.unique(data[:, 0])
n_train_ids = int(len(unique_ids) * train_proportion)
train_val_split = np.isin(data[:, 0], unique_ids[:n_train_ids])
data[train_val_split].tofile(output_prefix + '_train.bin')
data[~train_val_split].tofile(output_prefix + '_val.bin')