From cb7adb70d9c7cc042f76ce8b793a9476e61b82d6 Mon Sep 17 00:00:00 2001 From: Jiarui Li Date: Fri, 5 Dec 2025 00:54:56 +0800 Subject: [PATCH] Refactor data preparation and add loss functions for model training - Removed `prepare_data.py` as it is no longer needed. - Introduced `losses.py` containing ExponentialNLLLoss and WeibullLosses classes for calculating negative log-likelihood losses with regularization. - Added `model.py` which defines the DelphiFork model architecture, including a tabular encoder for handling continuous and categorical features, and merging sequences based on time order. --- age_encoder.py | 40 ++ backbones.py | 164 +++++ delphi_fork/labels.csv | 1270 ----------------------------------- delphi_fork/prepare_data.py | 216 ------ losses.py | 112 +++ model.py | 129 ++++ 6 files changed, 445 insertions(+), 1486 deletions(-) create mode 100644 age_encoder.py create mode 100644 backbones.py delete mode 100644 delphi_fork/labels.csv delete mode 100644 delphi_fork/prepare_data.py create mode 100644 losses.py create mode 100644 model.py diff --git a/age_encoder.py b/age_encoder.py new file mode 100644 index 0000000..566328b --- /dev/null +++ b/age_encoder.py @@ -0,0 +1,40 @@ +import torch +import torch.nn as nn + +class AgeSinusoidalEncoder(nn.Module): + def __init__(self, n_embd: int): + super().__init__() + if n_embd % 2 != 0: + raise ValueError("n_embd must be even for sinusoidal encoding.") + self.n_embd = n_embd + i = torch.arange(0, self.n_embd, 2, dtype=torch.float32) + divisor = torch.pow(10000, i / self.n_embd) + self.register_buffer('divisor', divisor) + + def forward(self, ages: torch.Tensor) -> torch.Tensor: + t_years = ages / 365.25 + # Broadcast (B, L, 1) against (1, 1, D/2) to get (B, L, D/2) + args = t_years.unsqueeze(-1) / self.divisor.view(1, 1, -1) + # Interleave cos and sin along the last dimension + output = torch.zeros( + ages.shape[0], ages.shape[1], self.n_embd, device=ages.device) + output[:, :, 0::2] = torch.cos(args) + output[:, :, 1::2] = torch.sin(args) + return output + +class AgeMLPEncoder(nn.Module): + def __init__(self, n_embd: int): + super().__init__() + self.mlp = nn.Sequential( + nn.Linear(2, 4 * n_embd), + nn.ReLU(), + nn.Linear(4 * n_embd, n_embd), + ) + + def forward(self, ages: torch.Tensor) -> torch.Tensor: + ages = ages.unsqueeze(-1).float() # (B, L, 1) + ages_normalized = ages / 365.25 # normalize to years + log1page = torch.log1p(ages_normalized) # (B, L, 1) + ages = torch.cat([ages_normalized, log1page], dim=-1) # (B, L, 2) + output = self.mlp(ages) # (B, L, n_embd) + return output \ No newline at end of file diff --git a/backbones.py b/backbones.py new file mode 100644 index 0000000..65b139f --- /dev/null +++ b/backbones.py @@ -0,0 +1,164 @@ +import torch +import torch.nn as nn +import torch.nn.functional as F +from typing import Optional + +class RMSNorm(nn.Module): + def __init__( + self, + n_embd: int, + eps: float = 1e-8, + ): + super().__init__() + self.n_embd = n_embd + self.eps = eps + self.weight = nn.Parameter(torch.ones(n_embd)) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + norm_x = x.norm(2, dim=-1, keepdim=True) + rms_x = norm_x * (self.n_embd ** -0.5) + x_normed = x / (rms_x + self.eps) + return self.weight * x_normed + +class SelfAttention(nn.Module): + def __init__( + self, + n_embd: int, + n_head: int, + attn_pdrop: float = 0.1, + ): + super().__init__() + assert n_embd % n_head == 0, "n_embd must be divisible by n_head" + self.n_head = n_head + self.head_dim = n_embd // n_head + + self.qkv_proj = nn.Linear(n_embd, 3 * n_embd, bias=False) + self.o_proj = nn.Linear(n_embd, n_embd, bias=False) + self.attn_pdrop = attn_pdrop + + def forward( + self, + x: torch.Tensor, + attn_mask: Optional[torch.Tensor] = None, + ) -> torch.Tensor: + B, L, D = x.shape + qkv = self.qkv_proj(x) # (B, L, 3D) + q, k, v = qkv.chunk(3, dim=-1) + + def reshape_heads(t): + # (B, H, L, d) + return t.view(B, L, self.n_head, self.head_dim).transpose(1, 2) + + q = reshape_heads(q) + k = reshape_heads(k) + v = reshape_heads(v) + + attn = F.scaled_dot_product_attention( + q, k, v, + attn_mask=attn_mask, + dropout_p=self.attn_pdrop, + ) # (B, H, L, d) + + attn = attn.transpose(1, 2).contiguous().view(B, L, D) # (B, L, D) + return self.o_proj(attn) + +class SwiGLUMLP(nn.Module): + def __init__( + self, + n_embd: int, + pdrop: float = 0.0, + ): + super().__init__() + hidden_dim = 4 * n_embd + self.fc1 = nn.Linear(n_embd, 2 * hidden_dim, bias=False) + self.fc2 = nn.Linear(hidden_dim, n_embd, bias=False) + self.dropout = nn.Dropout(pdrop) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + x1, x2 = self.fc1(x).chunk(2, dim=-1) + # SwiGLU: silu(x1) * x2 + x = F.silu(x1) * x2 + x = self.fc2(x) + return self.dropout(x) + +class Block(nn.Module): + def __init__( + self, + n_embd: int, + n_head: int, + pdrop: float = 0.0, + ): + super().__init__() + attn_pdrop = pdrop + + self.norm_1 = nn.LayerNorm(n_embd) + self.attn = SelfAttention( + n_embd=n_embd, + n_head=n_head, + attn_pdrop=attn_pdrop, + ) + self.norm_2 = nn.LayerNorm(n_embd) + self.mlp = nn.ModuleDict(dict( + c_fc=nn.Linear(n_embd, 4 * n_embd), + c_proj=nn.Linear(4 * n_embd, n_embd), + act=nn.GELU(), + dropout=nn.Dropout(pdrop), + )) + m = self.mlp + self.mlpf = lambda x: m.dropout( + m.c_proj(m.act(m.c_fc(x)))) + self.resid_dropout = nn.Dropout(pdrop) + + def forward( + self, + x: torch.Tensor, + attn_mask: Optional[torch.Tensor] = None, + ) -> torch.Tensor: + # Attention + h = self.norm_1(x) + h = self.attn(h, attn_mask=attn_mask) + x = x + self.resid_dropout(h) + + # MLP + h = self.norm_2(x) + h = self.mlpf(h) + x = x + self.resid_dropout(h) + + return x + +class ModernBlock(nn.Module): + def __init__( + self, + n_embd: int, + n_head: int, + pdrop: float = 0.0, + ): + super().__init__() + attn_pdrop = pdrop + mlp_pdrop = pdrop + + self.norm_1 = RMSNorm(n_embd) + self.attn = SelfAttention( + n_embd=n_embd, + n_head=n_head, + attn_pdrop=attn_pdrop, + ) + self.norm_2 = RMSNorm(n_embd) + self.mlp = SwiGLUMLP(n_embd=n_embd, pdrop=mlp_pdrop) + self.resid_dropout = nn.Dropout(pdrop) + + def forward( + self, + x: torch.Tensor, + attn_mask: Optional[torch.Tensor] = None, + ) -> torch.Tensor: + h = self.norm_1(x) + h = self.attn(h, attn_mask=attn_mask) + x = x + self.resid_dropout(h) + + # MLP + h = self.norm_2(x) + h = self.mlp(h) + x = x + self.resid_dropout(h) + + return x \ No newline at end of file diff --git a/delphi_fork/labels.csv b/delphi_fork/labels.csv deleted file mode 100644 index bf3b573..0000000 --- a/delphi_fork/labels.csv +++ /dev/null @@ -1,1270 +0,0 @@ -Padding -No event -Female -Male -BMI_low -BMI_mid -BMI_high -Smoking_low -Smoking_mid -Smoking_high -Alcohol_low -Alcohol_mid -Alcohol_high -A00 (cholera) -A01 (typhoid and paratyphoid fevers) -A02 (other salmonella infections) -A03 (shigellosis) -A04 (other bacterial intestinal infections) -A05 (other bacterial foodborne intoxications) -A06 (amoebiasis) -A07 (other protozoal intestinal diseases) -A08 (viral and other specified intestinal infections) -A09 (diarrhoea and gastro-enteritis of presumed infectious origin) -A15 (respiratory tuberculosis, bacteriologically and histologically confirmed) -A16 (respiratory tuberculosis, not confirmed bacteriologically or histologically) -A17 (tuberculosis of nervous system) -A18 (tuberculosis of other organs) -A19 (miliary tuberculosis) -A20 (plague) -A22 (anthrax) -A23 (brucellosis) -A24 (glanders and melioidosis) -A25 (rat-bite fevers) -A26 (erysipeloid) -A27 (leptospirosis) -A28 (other zoonotic bacterial diseases, not elsewhere classified) -A30 (leprosy [hansen's disease]) -A31 (infection due to other mycobacteria) -A32 (listeriosis) -A33 (tetanus neonatorum) -A35 (other tetanus) -A36 (diphtheria) -A37 (whooping cough) -A38 (scarlet fever) -A39 (meningococcal infection) -A40 (streptococcal septicaemia) -A41 (other septicaemia) -A42 (actinomycosis) -A43 (nocardiosis) -A44 (bartonellosis) -A46 (erysipelas) -A48 (other bacterial diseases, not elsewhere classified) -A49 (bacterial infection of unspecified site) -A50 (congenital syphilis) -A51 (early syphilis) -A52 (late syphilis) -A53 (other and unspecified syphilis) -A54 (gonococcal infection) -A55 (chlamydial lymphogranuloma (venereum)) -A56 (other sexually transmitted chlamydial diseases) -A58 (granuloma inguinale) -A59 (trichomoniasis) -A60 (anogenital herpesviral [herpes simplex] infections) -A63 (other predominantly sexually transmitted diseases, not elsewhere classified) -A64 (unspecified sexually transmitted disease) -A66 (yaws) -A67 (pinta [carate]) -A68 (relapsing fevers) -A69 (other spirochaetal infections) -A70 (chlamydia psittaci infection) -A71 (trachoma) -A74 (other diseases caused by chlamydiae) -A75 (typhus fever) -A77 (spotted fever [tick-borne rickettsioses]) -A78 (q fever) -A79 (other rickettsioses) -A80 (acute poliomyelitis) -A81 (atypical virus infections of central nervous system) -A82 (rabies) -A83 (mosquito-borne viral encephalitis) -A84 (tick-borne viral encephalitis) -A85 (other viral encephalitis, not elsewhere classified) -A86 (unspecified viral encephalitis) -A87 (viral meningitis) -A88 (other viral infections of central nervous system, not elsewhere classified) -A89 (unspecified viral infection of central nervous system) -A90 (dengue fever [classical dengue]) -A91 (dengue haemorrhagic fever) -A92 (other mosquito-borne viral fevers) -A93 (other arthropod-borne viral fevers, not elsewhere classified) -A94 (unspecified arthropod-borne viral fever) -A95 (yellow fever) -A97 (dengue) -A98 (other viral haemorrhagic fevers, not elsewhere classified) -B00 (herpesviral [herpes simplex] infections) -B01 (varicella [chickenpox]) -B02 (zoster [herpes zoster]) -B03 (smallpox) -B05 (measles) -B06 (rubella [german measles]) -B07 (viral warts) -B08 (other viral infections characterised by skin and mucous membrane lesions, not elsewhere classified) -B09 (unspecified viral infection characterised by skin and mucous membrane lesions) -B15 (acute hepatitis a) -B16 (acute hepatitis b) -B17 (other acute viral hepatitis) -B18 (chronic viral hepatitis) -B19 (unspecified viral hepatitis) -B20 (human immunodeficiency virus [hiv] disease resulting in infectious and parasitic diseases) -B21 (human immunodeficiency virus [hiv] disease resulting in malignant neoplasms) -B22 (human immunodeficiency virus [hiv] disease resulting in other specified diseases) -B23 (human immunodeficiency virus [hiv] disease resulting in other conditions) -B24 (unspecified human immunodeficiency virus [hiv] disease) -B25 (cytomegaloviral disease) -B26 (mumps) -B27 (infectious mononucleosis) -B30 (viral conjunctivitis) -B33 (other viral diseases, not elsewhere classified) -B34 (viral infection of unspecified site) -B35 (dermatophytosis) -B36 (other superficial mycoses) -B37 (candidiasis) -B38 (coccidioidomycosis) -B39 (histoplasmosis) -B40 (blastomycosis) -B42 (sporotrichosis) -B43 (chromomycosis and phaeomycotic abscess) -B44 (aspergillosis) -B45 (cryptococcosis) -B46 (zygomycosis) -B47 (mycetoma) -B48 (other mycoses, not elsewhere classified) -B49 (unspecified mycosis) -B50 (plasmodium falciparum malaria) -B51 (plasmodium vivax malaria) -B52 (plasmodium malariae malaria) -B53 (other parasitologically confirmed malaria) -B54 (unspecified malaria) -B55 (leishmaniasis) -B57 (chagas' disease) -B58 (toxoplasmosis) -B59 (pneumocystosis) -B60 (other protozoal diseases, not elsewhere classified) -B65 (schistosomiasis [bilharziasis]) -B66 (other fluke infections) -B67 (echinococcosis) -B68 (taeniasis) -B69 (cysticercosis) -B71 (other cestode infections) -B73 (onchocerciasis) -B74 (filariasis) -B75 (trichinellosis) -B76 (hookworm diseases) -B77 (ascariasis) -B78 (strongyloidiasis) -B79 (trichuriasis) -B80 (enterobiasis) -B81 (other intestinal helminthiases, not elsewhere classified) -B82 (unspecified intestinal parasitism) -B83 (other helminthiases) -B85 (pediculosis and phthiriasis) -B86 (scabies) -B87 (myiasis) -B88 (other infestations) -B89 (unspecified parasitic disease) -B90 (sequelae of tuberculosis) -B91 (sequelae of poliomyelitis) -B94 (sequelae of other and unspecified infectious and parasitic diseases) -B95 (streptococcus and staphylococcus as the cause of diseases classified to other chapters) -B96 (other bacterial agents as the cause of diseases classified to other chapters) -B97 (viral agents as the cause of diseases classified to other chapters) -B98 (other specified infectious agents as the cause of diseases classified to other chapters) -B99 (other and unspecified infectious diseases) -D50 (iron deficiency anaemia) -D51 (vitamin b12 deficiency anaemia) -D52 (folate deficiency anaemia) -D53 (other nutritional anaemias) -D55 (anaemia due to enzyme disorders) -D56 (thalassaemia) -D57 (sickle-cell disorders) -D58 (other hereditary haemolytic anaemias) -D59 (acquired haemolytic anaemia) -D60 (acquired pure red cell aplasia [erythroblastopenia]) -D61 (other aplastic anaemias) -D62 (acute posthaemorrhagic anaemia) -D63 (anaemia in chronic diseases classified elsewhere) -D64 (other anaemias) -D65 (disseminated intravascular coagulation [defibrination syndrome]) -D66 (hereditary factor viii deficiency) -D67 (hereditary factor ix deficiency) -D68 (other coagulation defects) -D69 (purpura and other haemorrhagic conditions) -D70 (agranulocytosis) -D71 (functional disorders of polymorphonuclear neutrophils) -D72 (other disorders of white blood cells) -D73 (diseases of spleen) -D74 (methaemoglobinaemia) -D75 (other diseases of blood and blood-forming organs) -D76 (certain diseases involving lymphoreticular tissue and reticulohistiocytic system) -D77 (other disorders of blood and blood-forming organs in diseases classified elsewhere) -D80 (immunodeficiency with predominantly antibody defects) -D81 (combined immunodeficiencies) -D82 (immunodeficiency associated with other major defects) -D83 (common variable immunodeficiency) -D84 (other immunodeficiencies) -D86 (sarcoidosis) -D89 (other disorders involving the immune mechanism, not elsewhere classified) -E01 (iodine-deficiency-related thyroid disorders and allied conditions) -E02 (subclinical iodine-deficiency hypothyroidism) -E03 (other hypothyroidism) -E04 (other non-toxic goitre) -E05 (thyrotoxicosis [hyperthyroidism]) -E06 (thyroiditis) -E07 (other disorders of thyroid) -E10 (insulin-dependent diabetes mellitus) -E11 (non-insulin-dependent diabetes mellitus) -E12 (malnutrition-related diabetes mellitus) -E13 (other specified diabetes mellitus) -E14 (unspecified diabetes mellitus) -E15 (nondiabetic hypoglycaemic coma) -E16 (other disorders of pancreatic internal secretion) -E20 (hypoparathyroidism) -E21 (hyperparathyroidism and other disorders of parathyroid gland) -E22 (hyperfunction of pituitary gland) -E23 (hypofunction and other disorders of pituitary gland) -E24 (cushing's syndrome) -E25 (adrenogenital disorders) -E26 (hyperaldosteronism) -E27 (other disorders of adrenal gland) -E28 (ovarian dysfunction) -E29 (testicular dysfunction) -E30 (disorders of puberty, not elsewhere classified) -E31 (polyglandular dysfunction) -E32 (diseases of thymus) -E34 (other endocrine disorders) -E35 (disorders of endocrine glands in diseases classified elsewhere) -E41 (nutritional marasmus) -E43 (unspecified severe protein-energy malnutrition) -E44 (protein-energy malnutrition of moderate and mild degree) -E45 (retarded development following protein-energy malnutrition) -E46 (unspecified protein-energy malnutrition) -E50 (vitamin a deficiency) -E51 (thiamine deficiency) -E52 (niacin deficiency [pellagra]) -E53 (deficiency of other b group vitamins) -E54 (ascorbic acid deficiency) -E55 (vitamin d deficiency) -E56 (other vitamin deficiencies) -E58 (dietary calcium deficiency) -E59 (dietary selenium deficiency) -E60 (dietary zinc deficiency) -E61 (deficiency of other nutrient elements) -E63 (other nutritional deficiencies) -E64 (sequelae of malnutrition and other nutritional deficiencies) -E65 (localised adiposity) -E66 (obesity) -E67 (other hyperalimentation) -E68 (sequelae of hyperalimentation) -E70 (disorders of aromatic amino-acid metabolism) -E71 (disorders of branched-chain amino-acid metabolism and fatty-acid metabolism) -E72 (other disorders of amino-acid metabolism) -E73 (lactose intolerance) -E74 (other disorders of carbohydrate metabolism) -E75 (disorders of sphingolipid metabolism and other lipid storage disorders) -E76 (disorders of glycosaminoglycan metabolism) -E77 (disorders of glycoprotein metabolism) -E78 (disorders of lipoprotein metabolism and other lipidaemias) -E79 (disorders of purine and pyrimidine metabolism) -E80 (disorders of porphyrin and bilirubin metabolism) -E83 (disorders of mineral metabolism) -E84 (cystic fibrosis) -E85 (amyloidosis) -E86 (volume depletion) -E87 (other disorders of fluid, electrolyte and acid-base balance) -E88 (other metabolic disorders) -E89 (postprocedural endocrine and metabolic disorders, not elsewhere classified) -F00 (dementia in alzheimer's disease) -F01 (vascular dementia) -F02 (dementia in other diseases classified elsewhere) -F03 (unspecified dementia) -F04 (organic amnesic syndrome, not induced by alcohol and other psychoactive substances) -F05 (delirium, not induced by alcohol and other psychoactive substances) -F06 (other mental disorders due to brain damage and dysfunction and to physical disease) -F07 (personality and behavioural disorders due to brain disease, damage and dysfunction) -F09 (unspecified organic or symptomatic mental disorder) -F10 (mental and behavioural disorders due to use of alcohol) -F11 (mental and behavioural disorders due to use of opioids) -F12 (mental and behavioural disorders due to use of cannabinoids) -F13 (mental and behavioural disorders due to use of sedatives or hypnotics) -F14 (mental and behavioural disorders due to use of cocaine) -F15 (mental and behavioural disorders due to use of other stimulants, including caffeine) -F16 (mental and behavioural disorders due to use of hallucinogens) -F17 (mental and behavioural disorders due to use of tobacco) -F18 (mental and behavioural disorders due to use of volatile solvents) -F19 (mental and behavioural disorders due to multiple drug use and use of other psychoactive substances) -F20 (schizophrenia) -F21 (schizotypal disorder) -F22 (persistent delusional disorders) -F23 (acute and transient psychotic disorders) -F24 (induced delusional disorder) -F25 (schizoaffective disorders) -F28 (other nonorganic psychotic disorders) -F29 (unspecified nonorganic psychosis) -F30 (manic episode) -F31 (bipolar affective disorder) -F32 (depressive episode) -F33 (recurrent depressive disorder) -F34 (persistent mood [affective] disorders) -F38 (other mood [affective] disorders) -F39 (unspecified mood [affective] disorder) -F40 (phobic anxiety disorders) -F41 (other anxiety disorders) -F42 (obsessive-compulsive disorder) -F43 (reaction to severe stress, and adjustment disorders) -F44 (dissociative [conversion] disorders) -F45 (somatoform disorders) -F48 (other neurotic disorders) -F50 (eating disorders) -F51 (nonorganic sleep disorders) -F52 (sexual dysfunction, not caused by organic disorder or disease) -F53 (mental and behavioural disorders associated with the puerperium, not elsewhere classified) -F54 (psychological and behavioural factors associated with disorders or diseases classified elsewhere) -F55 (abuse of non-dependence-producing substances) -F59 (unspecified behavioural syndromes associated with physiological disturbances and physical factors) -F60 (specific personality disorders) -F61 (mixed and other personality disorders) -F62 (enduring personality changes, not attributable to brain damage and disease) -F63 (habit and impulse disorders) -F64 (gender identity disorders) -F65 (disorders of sexual preference) -F66 (psychological and behavioural disorders associated with sexual development and orientation) -F68 (other disorders of adult personality and behaviour) -F69 (unspecified disorder of adult personality and behaviour) -F70 (mild mental retardation) -F71 (moderate mental retardation) -F72 (severe mental retardation) -F78 (other mental retardation) -F79 (unspecified mental retardation) -F80 (specific developmental disorders of speech and language) -F81 (specific developmental disorders of scholastic skills) -F82 (specific developmental disorder of motor function) -F83 (mixed specific developmental disorders) -F84 (pervasive developmental disorders) -F88 (other disorders of psychological development) -F89 (unspecified disorder of psychological development) -F90 (hyperkinetic disorders) -F91 (conduct disorders) -F92 (mixed disorders of conduct and emotions) -F93 (emotional disorders with onset specific to childhood) -F94 (disorders of social functioning with onset specific to childhood and adolescence) -F95 (tic disorders) -F98 (other behavioural and emotional disorders with onset usually occurring in childhood and adolescence) -F99 (mental disorder, not otherwise specified) -G00 (bacterial meningitis, not elsewhere classified) -G01 (meningitis in bacterial diseases classified elsewhere) -G02 (meningitis in other infectious and parasitic diseases classified elsewhere) -G03 (meningitis due to other and unspecified causes) -G04 (encephalitis, myelitis and encephalomyelitis) -G05 (encephalitis, myelitis and encephalomyelitis in diseases classified elsewhere) -G06 (intracranial and intraspinal abscess and granuloma) -G07 (intracranial and intraspinal abscess and granuloma in diseases classified elsewhere) -G08 (intracranial and intraspinal phlebitis and thrombophlebitis) -G09 (sequelae of inflammatory diseases of central nervous system) -G10 (huntington's disease) -G11 (hereditary ataxia) -G12 (spinal muscular atrophy and related syndromes) -G13 (systemic atrophies primarily affecting central nervous system in diseases classified elsewhere) -G14 (postpolio syndrome) -G20 (parkinson's disease) -G21 (secondary parkinsonism) -G22 (parkinsonism in diseases classified elsewhere) -G23 (other degenerative diseases of basal ganglia) -G24 (dystonia) -G25 (other extrapyramidal and movement disorders) -G30 (alzheimer's disease) -G31 (other degenerative diseases of nervous system, not elsewhere classified) -G32 (other degenerative disorders of nervous system in diseases classified elsewhere) -G35 (multiple sclerosis) -G36 (other acute disseminated demyelination) -G37 (other demyelinating diseases of central nervous system) -G40 (epilepsy) -G41 (status epilepticus) -G43 (migraine) -G44 (other headache syndromes) -G45 (transient cerebral ischaemic attacks and related syndromes) -G46 (vascular syndromes of brain in cerebrovascular diseases) -G47 (sleep disorders) -G50 (disorders of trigeminal nerve) -G51 (facial nerve disorders) -G52 (disorders of other cranial nerves) -G53 (cranial nerve disorders in diseases classified elsewhere) -G54 (nerve root and plexus disorders) -G55 (nerve root and plexus compressions in diseases classified elsewhere) -G56 (mononeuropathies of upper limb) -G57 (mononeuropathies of lower limb) -G58 (other mononeuropathies) -G59 (mononeuropathy in diseases classified elsewhere) -G60 (hereditary and idiopathic neuropathy) -G61 (inflammatory polyneuropathy) -G62 (other polyneuropathies) -G63 (polyneuropathy in diseases classified elsewhere) -G64 (other disorders of peripheral nervous system) -G70 (myasthenia gravis and other myoneural disorders) -G71 (primary disorders of muscles) -G72 (other myopathies) -G73 (disorders of myoneural junction and muscle in diseases classified elsewhere) -G80 (infantile cerebral palsy) -G81 (hemiplegia) -G82 (paraplegia and tetraplegia) -G83 (other paralytic syndromes) -G90 (disorders of autonomic nervous system) -G91 (hydrocephalus) -G92 (toxic encephalopathy) -G93 (other disorders of brain) -G94 (other disorders of brain in diseases classified elsewhere) -G95 (other diseases of spinal cord) -G96 (other disorders of central nervous system) -G97 (postprocedural disorders of nervous system, not elsewhere classified) -G98 (other disorders of nervous system, not elsewhere classified) -G99 (other disorders of nervous system in diseases classified elsewhere) -H00 (hordeolum and chalazion) -H01 (other inflammation of eyelid) -H02 (other disorders of eyelid) -H03 (disorders of eyelid in diseases classified elsewhere) -H04 (disorders of lachrymal system) -H05 (disorders of orbit) -H06 (disorders of lachrymal system and orbit in diseases classified elsewhere) -H10 (conjunctivitis) -H11 (other disorders of conjunctiva) -H13 (disorders of conjunctiva in diseases classified elsewhere) -H15 (disorders of sclera) -H16 (keratitis) -H17 (corneal scars and opacities) -H18 (other disorders of cornea) -H19 (disorders of sclera and cornea in diseases classified elsewhere) -H20 (iridocyclitis) -H21 (other disorders of iris and ciliary body) -H22 (disorders of iris and ciliary body in diseases classified elsewhere) -H25 (senile cataract) -H26 (other cataract) -H27 (other disorders of lens) -H28 (cataract and other disorders of lens in diseases classified elsewhere) -H30 (chorioretinal inflammation) -H31 (other disorders of choroid) -H32 (chorioretinal disorders in diseases classified elsewhere) -H33 (retinal detachments and breaks) -H34 (retinal vascular occlusions) -H35 (other retinal disorders) -H36 (retinal disorders in diseases classified elsewhere) -H40 (glaucoma) -H42 (glaucoma in diseases classified elsewhere) -H43 (disorders of vitreous body) -H44 (disorders of globe) -H45 (disorders of vitreous body and globe in diseases classified elsewhere) -H46 (optic neuritis) -H47 (other disorders of optic [2nd] nerve and visual pathways) -H48 (disorders of optic [2nd] nerve and visual pathways in diseases classified elsewhere) -H49 (paralytic strabismus) -H50 (other strabismus) -H51 (other disorders of binocular movement) -H52 (disorders of refraction and accommodation) -H53 (visual disturbances) -H54 (blindness and low vision) -H55 (nystagmus and other irregular eye movements) -H57 (other disorders of eye and adnexa) -H58 (other disorders of eye and adnexa in diseases classified elsewhere) -H59 (postprocedural disorders of eye and adnexa, not elsewhere classified) -H60 (otitis externa) -H61 (other disorders of external ear) -H62 (disorders of external ear in diseases classified elsewhere) -H65 (nonsuppurative otitis media) -H66 (suppurative and unspecified otitis media) -H67 (otitis media in diseases classified elsewhere) -H68 (eustachian salpingitis and obstruction) -H69 (other disorders of eustachian tube) -H70 (mastoiditis and related conditions) -H71 (cholesteatoma of middle ear) -H72 (perforation of tympanic membrane) -H73 (other disorders of tympanic membrane) -H74 (other disorders of middle ear and mastoid) -H75 (other disorders of middle ear and mastoid in diseases classified elsewhere) -H80 (otosclerosis) -H81 (disorders of vestibular function) -H82 (vertiginous syndromes in diseases classified elsewhere) -H83 (other diseases of inner ear) -H90 (conductive and sensorineural hearing loss) -H91 (other hearing loss) -H92 (otalgia and effusion of ear) -H93 (other disorders of ear, not elsewhere classified) -H94 (other disorders of ear in diseases classified elsewhere) -H95 (postprocedural disorders of ear and mastoid process, not elsewhere classified) -I00 (rheumatic fever without mention of heart involvement) -I01 (rheumatic fever with heart involvement) -I02 (rheumatic chorea) -I05 (rheumatic mitral valve diseases) -I06 (rheumatic aortic valve diseases) -I07 (rheumatic tricuspid valve diseases) -I08 (multiple valve diseases) -I09 (other rheumatic heart diseases) -I10 (essential (primary) hypertension) -I11 (hypertensive heart disease) -I12 (hypertensive renal disease) -I13 (hypertensive heart and renal disease) -I15 (secondary hypertension) -I20 (angina pectoris) -I21 (acute myocardial infarction) -I22 (subsequent myocardial infarction) -I23 (certain current complications following acute myocardial infarction) -I24 (other acute ischaemic heart diseases) -I25 (chronic ischaemic heart disease) -I26 (pulmonary embolism) -I27 (other pulmonary heart diseases) -I28 (other diseases of pulmonary vessels) -I30 (acute pericarditis) -I31 (other diseases of pericardium) -I32 (pericarditis in diseases classified elsewhere) -I33 (acute and subacute endocarditis) -I34 (nonrheumatic mitral valve disorders) -I35 (nonrheumatic aortic valve disorders) -I36 (nonrheumatic tricuspid valve disorders) -I37 (pulmonary valve disorders) -I38 (endocarditis, valve unspecified) -I39 (endocarditis and heart valve disorders in diseases classified elsewhere) -I40 (acute myocarditis) -I41 (myocarditis in diseases classified elsewhere) -I42 (cardiomyopathy) -I43 (cardiomyopathy in diseases classified elsewhere) -I44 (atrioventricular and left bundle-branch block) -I45 (other conduction disorders) -I46 (cardiac arrest) -I47 (paroxysmal tachycardia) -I48 (atrial fibrillation and flutter) -I49 (other cardiac arrhythmias) -I50 (heart failure) -I51 (complications and ill-defined descriptions of heart disease) -I52 (other heart disorders in diseases classified elsewhere) -I60 (subarachnoid haemorrhage) -I61 (intracerebral haemorrhage) -I62 (other nontraumatic intracranial haemorrhage) -I63 (cerebral infarction) -I64 (stroke, not specified as haemorrhage or infarction) -I65 (occlusion and stenosis of precerebral arteries, not resulting in cerebral infarction) -I66 (occlusion and stenosis of cerebral arteries, not resulting in cerebral infarction) -I67 (other cerebrovascular diseases) -I68 (cerebrovascular disorders in diseases classified elsewhere) -I69 (sequelae of cerebrovascular disease) -I70 (atherosclerosis) -I71 (aortic aneurysm and dissection) -I72 (other aneurysm) -I73 (other peripheral vascular diseases) -I74 (arterial embolism and thrombosis) -I77 (other disorders of arteries and arterioles) -I78 (diseases of capillaries) -I79 (disorders of arteries, arterioles and capillaries in diseases classified elsewhere) -I80 (phlebitis and thrombophlebitis) -I81 (portal vein thrombosis) -I82 (other venous embolism and thrombosis) -I83 (varicose veins of lower extremities) -I84 (haemorrhoids) -I85 (oesophageal varices) -I86 (varicose veins of other sites) -I87 (other disorders of veins) -I88 (nonspecific lymphadenitis) -I89 (other non-infective disorders of lymphatic vessels and lymph nodes) -I95 (hypotension) -I97 (postprocedural disorders of circulatory system, not elsewhere classified) -I98 (other disorders of circulatory system in diseases classified elsewhere) -I99 (other and unspecified disorders of circulatory system) -J00 (acute nasopharyngitis [common cold]) -J01 (acute sinusitis) -J02 (acute pharyngitis) -J03 (acute tonsillitis) -J04 (acute laryngitis and tracheitis) -J05 (acute obstructive laryngitis [croup] and epiglottitis) -J06 (acute upper respiratory infections of multiple and unspecified sites) -J09 (influenza due to certain identified influenza virus) -J10 (influenza due to identified influenza virus) -J11 (influenza, virus not identified) -J12 (viral pneumonia, not elsewhere classified) -J13 (pneumonia due to streptococcus pneumoniae) -J14 (pneumonia due to haemophilus influenzae) -J15 (bacterial pneumonia, not elsewhere classified) -J16 (pneumonia due to other infectious organisms, not elsewhere classified) -J17 (pneumonia in diseases classified elsewhere) -J18 (pneumonia, organism unspecified) -J20 (acute bronchitis) -J21 (acute bronchiolitis) -J22 (unspecified acute lower respiratory infection) -J30 (vasomotor and allergic rhinitis) -J31 (chronic rhinitis, nasopharyngitis and pharyngitis) -J32 (chronic sinusitis) -J33 (nasal polyp) -J34 (other disorders of nose and nasal sinuses) -J35 (chronic diseases of tonsils and adenoids) -J36 (peritonsillar abscess) -J37 (chronic laryngitis and laryngotracheitis) -J38 (diseases of vocal cords and larynx, not elsewhere classified) -J39 (other diseases of upper respiratory tract) -J40 (bronchitis, not specified as acute or chronic) -J41 (simple and mucopurulent chronic bronchitis) -J42 (unspecified chronic bronchitis) -J43 (emphysema) -J44 (other chronic obstructive pulmonary disease) -J45 (asthma) -J46 (status asthmaticus) -J47 (bronchiectasis) -J60 (coalworker's pneumoconiosis) -J61 (pneumoconiosis due to asbestos and other mineral fibres) -J62 (pneumoconiosis due to dust containing silica) -J63 (pneumoconiosis due to other inorganic dusts) -J64 (unspecified pneumoconiosis) -J66 (airway disease due to specific organic dust) -J67 (hypersensitivity pneumonitis due to organic dust) -J68 (respiratory conditions due to inhalation of chemicals, gases, fumes and vapours) -J69 (pneumonitis due to solids and liquids) -J70 (respiratory conditions due to other external agents) -J80 (adult respiratory distress syndrome) -J81 (pulmonary oedema) -J82 (pulmonary eosinophilia, not elsewhere classified) -J84 (other interstitial pulmonary diseases) -J85 (abscess of lung and mediastinum) -J86 (pyothorax) -J90 (pleural effusion, not elsewhere classified) -J91 (pleural effusion in conditions classified elsewhere) -J92 (pleural plaque) -J93 (pneumothorax) -J94 (other pleural conditions) -J95 (postprocedural respiratory disorders, not elsewhere classified) -J96 (respiratory failure, not elsewhere classified) -J98 (other respiratory disorders) -J99 (respiratory disorders in diseases classified elsewhere) -K00 (disorders of tooth development and eruption) -K01 (embedded and impacted teeth) -K02 (dental caries) -K03 (other diseases of hard tissues of teeth) -K04 (diseases of pulp and periapical tissues) -K05 (gingivitis and periodontal diseases) -K06 (other disorders of gingiva and edentulous alveolar ridge) -K07 (dentofacial anomalies [including malocclusion]) -K08 (other disorders of teeth and supporting structures) -K09 (cysts of oral region, not elsewhere classified) -K10 (other diseases of jaws) -K11 (diseases of salivary glands) -K12 (stomatitis and related lesions) -K13 (other diseases of lip and oral mucosa) -K14 (diseases of tongue) -K20 (oesophagitis) -K21 (gastro-oesophageal reflux disease) -K22 (other diseases of oesophagus) -K23 (disorders of oesophagus in diseases classified elsewhere) -K25 (gastric ulcer) -K26 (duodenal ulcer) -K27 (peptic ulcer, site unspecified) -K28 (gastrojejunal ulcer) -K29 (gastritis and duodenitis) -K30 (dyspepsia) -K31 (other diseases of stomach and duodenum) -K35 (acute appendicitis) -K36 (other appendicitis) -K37 (unspecified appendicitis) -K38 (other diseases of appendix) -K40 (inguinal hernia) -K41 (femoral hernia) -K42 (umbilical hernia) -K43 (ventral hernia) -K44 (diaphragmatic hernia) -K45 (other abdominal hernia) -K46 (unspecified abdominal hernia) -K50 (crohn's disease [regional enteritis]) -K51 (ulcerative colitis) -K52 (other non-infective gastro-enteritis and colitis) -K55 (vascular disorders of intestine) -K56 (paralytic ileus and intestinal obstruction without hernia) -K57 (diverticular disease of intestine) -K58 (irritable bowel syndrome) -K59 (other functional intestinal disorders) -K60 (fissure and fistula of anal and rectal regions) -K61 (abscess of anal and rectal regions) -K62 (other diseases of anus and rectum) -K63 (other diseases of intestine) -K64 (haemorrhoids and perianal venous thrombosis) -K65 (peritonitis) -K66 (other disorders of peritoneum) -K67 (disorders of peritoneum in infectious diseases classified elsewhere) -K70 (alcoholic liver disease) -K71 (toxic liver disease) -K72 (hepatic failure, not elsewhere classified) -K73 (chronic hepatitis, not elsewhere classified) -K74 (fibrosis and cirrhosis of liver) -K75 (other inflammatory liver diseases) -K76 (other diseases of liver) -K77 (liver disorders in diseases classified elsewhere) -K80 (cholelithiasis) -K81 (cholecystitis) -K82 (other diseases of gallbladder) -K83 (other diseases of biliary tract) -K85 (acute pancreatitis) -K86 (other diseases of pancreas) -K87 (disorders of gallbladder, biliary tract and pancreas in diseases classified elsewhere) -K90 (intestinal malabsorption) -K91 (postprocedural disorders of digestive system, not elsewhere classified) -K92 (other diseases of digestive system) -K93 (disorders of other digestive organs in diseases classified elsewhere) -L00 (staphylococcal scalded skin syndrome) -L01 (impetigo) -L02 (cutaneous abscess, furuncle and carbuncle) -L03 (cellulitis) -L04 (acute lymphadenitis) -L05 (pilonidal cyst) -L08 (other local infections of skin and subcutaneous tissue) -L10 (pemphigus) -L11 (other acantholytic disorders) -L12 (pemphigoid) -L13 (other bullous disorders) -L14 (bullous disorders in diseases classified elsewhere) -L20 (atopic dermatitis) -L21 (seborrhoeic dermatitis) -L22 (diaper [napkin] dermatitis) -L23 (allergic contact dermatitis) -L24 (irritant contact dermatitis) -L25 (unspecified contact dermatitis) -L26 (exfoliative dermatitis) -L27 (dermatitis due to substances taken internally) -L28 (lichen simplex chronicus and prurigo) -L29 (pruritus) -L30 (other dermatitis) -L40 (psoriasis) -L41 (parapsoriasis) -L42 (pityriasis rosea) -L43 (lichen planus) -L44 (other papulosquamous disorders) -L50 (urticaria) -L51 (erythema multiforme) -L52 (erythema nodosum) -L53 (other erythematous conditions) -L54 (erythema in diseases classified elsewhere) -L55 (sunburn) -L56 (other acute skin changes due to ultraviolet radiation) -L57 (skin changes due to chronic exposure to nonionising radiation) -L58 (radiodermatitis) -L59 (other disorders of skin and subcutaneous tissue related to radiation) -L60 (nail disorders) -L62 (nail disorders in diseases classified elsewhere) -L63 (alopecia areata) -L64 (androgenic alopecia) -L65 (other nonscarring hair loss) -L66 (cicatricial alopecia [scarring hair loss]) -L67 (hair colour and hair shaft abnormalities) -L68 (hypertrichosis) -L70 (acne) -L71 (rosacea) -L72 (follicular cysts of skin and subcutaneous tissue) -L73 (other follicular disorders) -L74 (eccrine sweat disorders) -L75 (apocrine sweat disorders) -L80 (vitiligo) -L81 (other disorders of pigmentation) -L82 (seborrhoeic keratosis) -L83 (acanthosis nigricans) -L84 (corns and callosities) -L85 (other epidermal thickening) -L86 (keratoderma in diseases classified elsewhere) -L87 (transepidermal elimination disorders) -L88 (pyoderma gangrenosum) -L89 (decubitus ulcer) -L90 (atrophic disorders of skin) -L91 (hypertrophic disorders of skin) -L92 (granulomatous disorders of skin and subcutaneous tissue) -L93 (lupus erythematosus) -L94 (other localised connective tissue disorders) -L95 (vasculitis limited to skin, not elsewhere classified) -L97 (ulcer of lower limb, not elsewhere classified) -L98 (other disorders of skin and subcutaneous tissue, not elsewhere classified) -L99 (other disorders of skin and subcutaneous tissue in diseases classified elsewhere) -M00 (pyogenic arthritis) -M01 (direct infections of joint in infectious and parasitic diseases classified elsewhere) -M02 (reactive arthropathies) -M03 (postinfective and reactive arthropathies in diseases classified elsewhere) -M05 (seropositive rheumatoid arthritis) -M06 (other rheumatoid arthritis) -M07 (psoriatic and enteropathic arthropathies) -M08 (juvenile arthritis) -M09 (juvenile arthritis in diseases classified elsewhere) -M10 (gout) -M11 (other crystal arthropathies) -M12 (other specific arthropathies) -M13 (other arthritis) -M14 (arthropathies in other diseases classified elsewhere) -M15 (polyarthrosis) -M16 (coxarthrosis [arthrosis of hip]) -M17 (gonarthrosis [arthrosis of knee]) -M18 (arthrosis of first carpometacarpal joint) -M19 (other arthrosis) -M20 (acquired deformities of fingers and toes) -M21 (other acquired deformities of limbs) -M22 (disorders of patella) -M23 (internal derangement of knee) -M24 (other specific joint derangements) -M25 (other joint disorders, not elsewhere classified) -M30 (polyarteritis nodosa and related conditions) -M31 (other necrotising vasculopathies) -M32 (systemic lupus erythematosus) -M33 (dermatopolymyositis) -M34 (systemic sclerosis) -M35 (other systemic involvement of connective tissue) -M36 (systemic disorders of connective tissue in diseases classified elsewhere) -M40 (kyphosis and lordosis) -M41 (scoliosis) -M42 (spinal osteochondrosis) -M43 (other deforming dorsopathies) -M45 (ankylosing spondylitis) -M46 (other inflammatory spondylopathies) -M47 (spondylosis) -M48 (other spondylopathies) -M49 (spondylopathies in diseases classified elsewhere) -M50 (cervical disk disorders) -M51 (other intervertebral disk disorders) -M53 (other dorsopathies, not elsewhere classified) -M54 (dorsalgia) -M60 (myositis) -M61 (calcification and ossification of muscle) -M62 (other disorders of muscle) -M63 (disorders of muscle in diseases classified elsewhere) -M65 (synovitis and tenosynovitis) -M66 (spontaneous rupture of synovium and tendon) -M67 (other disorders of synovium and tendon) -M68 (disorders of synovium and tendon in diseases classified elsewhere) -M70 (soft tissue disorders related to use, overuse and pressure) -M71 (other bursopathies) -M72 (fibroblastic disorders) -M73 (soft tissue disorders in diseases classified elsewhere) -M75 (shoulder lesions) -M76 (enthesopathies of lower limb, excluding foot) -M77 (other enthesopathies) -M79 (other soft tissue disorders, not elsewhere classified) -M80 (osteoporosis with pathological fracture) -M81 (osteoporosis without pathological fracture) -M82 (osteoporosis in diseases classified elsewhere) -M83 (adult osteomalacia) -M84 (disorders of continuity of bone) -M85 (other disorders of bone density and structure) -M86 (osteomyelitis) -M87 (osteonecrosis) -M88 (paget's disease of bone [osteitis deformans]) -M89 (other disorders of bone) -M90 (osteopathies in diseases classified elsewhere) -M91 (juvenile osteochondrosis of hip and pelvis) -M92 (other juvenile osteochondrosis) -M93 (other osteochondropathies) -M94 (other disorders of cartilage) -M95 (other acquired deformities of musculoskeletal system and connective tissue) -M96 (postprocedural musculoskeletal disorders, not elsewhere classified) -M99 (biomechanical lesions, not elsewhere classified) -N00 (acute nephritic syndrome) -N01 (rapidly progressive nephritic syndrome) -N02 (recurrent and persistent haematuria) -N03 (chronic nephritic syndrome) -N04 (nephrotic syndrome) -N05 (unspecified nephritic syndrome) -N06 (isolated proteinuria with specified morphological lesion) -N07 (hereditary nephropathy, not elsewhere classified) -N08 (glomerular disorders in diseases classified elsewhere) -N10 (acute tubulo-interstitial nephritis) -N11 (chronic tubulo-interstitial nephritis) -N12 (tubulo-interstitial nephritis, not specified as acute or chronic) -N13 (obstructive and reflux uropathy) -N14 (drug- and heavy-metal-induced tubulo-interstitial and tubular conditions) -N15 (other renal tubulo-interstitial diseases) -N16 (renal tubulo-interstitial disorders in diseases classified elsewhere) -N17 (acute renal failure) -N18 (chronic renal failure) -N19 (unspecified renal failure) -N20 (calculus of kidney and ureter) -N21 (calculus of lower urinary tract) -N22 (calculus of urinary tract in diseases classified elsewhere) -N23 (unspecified renal colic) -N25 (disorders resulting from impaired renal tubular function) -N26 (unspecified contracted kidney) -N27 (small kidney of unknown cause) -N28 (other disorders of kidney and ureter, not elsewhere classified) -N29 (other disorders of kidney and ureter in diseases classified elsewhere) -N30 (cystitis) -N31 (neuromuscular dysfunction of bladder, not elsewhere classified) -N32 (other disorders of bladder) -N33 (bladder disorders in diseases classified elsewhere) -N34 (urethritis and urethral syndrome) -N35 (urethral stricture) -N36 (other disorders of urethra) -N37 (urethral disorders in diseases classified elsewhere) -N39 (other disorders of urinary system) -N40 (hyperplasia of prostate) -N41 (inflammatory diseases of prostate) -N42 (other disorders of prostate) -N43 (hydrocele and spermatocele) -N44 (torsion of testis) -N45 (orchitis and epididymitis) -N46 (male infertility) -N47 (redundant prepuce, phimosis and paraphimosis) -N48 (other disorders of penis) -N49 (inflammatory disorders of male genital organs, not elsewhere classified) -N50 (other disorders of male genital organs) -N51 (disorders of male genital organs in diseases classified elsewhere) -N60 (benign mammary dysplasia) -N61 (inflammatory disorders of breast) -N62 (hypertrophy of breast) -N63 (unspecified lump in breast) -N64 (other disorders of breast) -N70 (salpingitis and oophoritis) -N71 (inflammatory disease of uterus, except cervix) -N72 (inflammatory disease of cervix uteri) -N73 (other female pelvic inflammatory diseases) -N74 (female pelvic inflammatory disorders in diseases classified elsewhere) -N75 (diseases of bartholin's gland) -N76 (other inflammation of vagina and vulva) -N77 (vulvovaginal ulceration and inflammation in diseases classified elsewhere) -N80 (endometriosis) -N81 (female genital prolapse) -N82 (fistulae involving female genital tract) -N83 (noninflammatory disorders of ovary, fallopian tube and broad ligament) -N84 (polyp of female genital tract) -N85 (other noninflammatory disorders of uterus, except cervix) -N86 (erosion and ectropion of cervix uteri) -N87 (dysplasia of cervix uteri) -N88 (other noninflammatory disorders of cervix uteri) -N89 (other noninflammatory disorders of vagina) -N90 (other noninflammatory disorders of vulva and perineum) -N91 (absent, scanty and rare menstruation) -N92 (excessive, frequent and irregular menstruation) -N93 (other abnormal uterine and vaginal bleeding) -N94 (pain and other conditions associated with female genital organs and menstrual cycle) -N95 (menopausal and other perimenopausal disorders) -N96 (habitual aborter) -N97 (female infertility) -N98 (complications associated with artificial fertilisation) -N99 (postprocedural disorders of genito-urinary system, not elsewhere classified) -O00 (ectopic pregnancy) -O01 (hydatidiform mole) -O02 (other abnormal products of conception) -O03 (spontaneous abortion) -O04 (medical abortion) -O05 (other abortion) -O06 (unspecified abortion) -O07 (failed attempted abortion) -O08 (complications following abortion and ectopic and molar pregnancy) -O10 (pre-existing hypertension complicating pregnancy, childbirth and the puerperium) -O11 (pre-existing hypertensive disorder with superimposed proteinuria) -O12 (gestational [pregnancy-induced] oedema and proteinuria without hypertension) -O13 (gestational [pregnancy-induced] hypertension without significant proteinuria) -O14 (gestational [pregnancy-induced] hypertension with significant proteinuria) -O15 (eclampsia) -O16 (unspecified maternal hypertension) -O20 (haemorrhage in early pregnancy) -O21 (excessive vomiting in pregnancy) -O22 (venous complications in pregnancy) -O23 (infections of genito-urinary tract in pregnancy) -O24 (diabetes mellitus in pregnancy) -O25 (malnutrition in pregnancy) -O26 (maternal care for other conditions predominantly related to pregnancy) -O28 (abnormal findings on antenatal screening of mother) -O29 (complications of anaesthesia during pregnancy) -O30 (multiple gestation) -O31 (complications specific to multiple gestation) -O32 (maternal care for known or suspected malpresentation of foetus) -O33 (maternal care for known or suspected disproportion) -O34 (maternal care for known or suspected abnormality of pelvic organs) -O35 (maternal care for known or suspected foetal abnormality and damage) -O36 (maternal care for other known or suspected foetal problems) -O40 (polyhydramnios) -O41 (other disorders of amniotic fluid and membranes) -O42 (premature rupture of membranes) -O43 (placental disorders) -O44 (placenta praevia) -O45 (premature separation of placenta [abruptio placentae]) -O46 (antepartum haemorrhage, not elsewhere classified) -O47 (false labour) -O48 (prolonged pregnancy) -O60 (preterm delivery) -O61 (failed induction of labour) -O62 (abnormalities of forces of labour) -O63 (long labour) -O64 (obstructed labour due to malposition and malpresentation of foetus) -O65 (obstructed labour due to maternal pelvic abnormality) -O66 (other obstructed labour) -O67 (labour and delivery complicated by intrapartum haemorrhage, not elsewhere classified) -O68 (labour and delivery complicated by foetal stress [distress]) -O69 (labour and delivery complicated by umbilical cord complications) -O70 (perineal laceration during delivery) -O71 (other obstetric trauma) -O72 (postpartum haemorrhage) -O73 (retained placenta and membranes, without haemorrhage) -O74 (complications of anaesthesia during labour and delivery) -O75 (other complications of labour and delivery, not elsewhere classified) -O80 (single spontaneous delivery) -O81 (single delivery by forceps and vacuum extractor) -O82 (single delivery by caesarean section) -O83 (other assisted single delivery) -O84 (multiple delivery) -O85 (puerperal sepsis) -O86 (other puerperal infections) -O87 (venous complications in the puerperium) -O88 (obstetric embolism) -O89 (complications of anaesthesia during the puerperium) -O90 (complications of the puerperium, not elsewhere classified) -O91 (infections of breast associated with childbirth) -O92 (other disorders of breast and lactation associated with childbirth) -O94 (sequelae of complication of pregnancy, childbirth and the puerperium) -O96 (death from any obstetric cause occurring more than 42 days but less than one year after delivery) -O98 (maternal infectious and parasitic diseases classifiable elsewhere but complicating pregnancy, childbirth and the puerperium) -O99 (other maternal diseases classifiable elsewhere but complicating pregnancy, childbirth and the puerperium) -P00 (foetus and newborn affected by maternal conditions that may be unrelated to present pregnancy) -P02 (foetus and newborn affected by complications of placenta, cord and membranes) -P03 (foetus and newborn affected by other complications of labour and delivery) -P04 (foetus and newborn affected by noxious influences transmitted via placenta or breast milk) -P05 (slow foetal growth and foetal malnutrition) -P07 (disorders related to short gestation and low birth weight, not elsewhere classified) -P08 (disorders related to long gestation and high birth weight) -P10 (intracranial laceration and haemorrhage due to birth injury) -P11 (other birth injuries to central nervous system) -P12 (birth injury to scalp) -P13 (birth injury to skeleton) -P14 (birth injury to peripheral nervous system) -P15 (other birth injuries) -P20 (intra-uterine hypoxia) -P21 (birth asphyxia) -P22 (respiratory distress of newborn) -P23 (congenital pneumonia) -P24 (neonatal aspiration syndromes) -P25 (interstitial emphysema and related conditions originating in the perinatal period) -P26 (pulmonary haemorrhage originating in the perinatal period) -P27 (chronic respiratory disease originating in the perinatal period) -P28 (other respiratory conditions originating in the perinatal period) -P29 (cardiovascular disorders originating in the perinatal period) -P35 (congenital viral diseases) -P36 (bacterial sepsis of newborn) -P37 (other congenital infectious and parasitic diseases) -P38 (omphalitis of newborn with or without mild haemorrhage) -P39 (other infections specific to the perinatal period) -P50 (foetal blood loss) -P51 (umbilical haemorrhage of newborn) -P52 (intracranial nontraumatic haemorrhage of foetus and newborn) -P53 (haemorrhagic disease of foetus and newborn) -P54 (other neonatal haemorrhages) -P55 (haemolytic disease of foetus and newborn) -P58 (neonatal jaundice due to other excessive haemolysis) -P59 (neonatal jaundice from other and unspecified causes) -P61 (other perinatal haematological disorders) -P70 (transitory disorders of carbohydrate metabolism specific to foetus and newborn) -P71 (transitory neonatal disorders of calcium and magnesium metabolism) -P78 (other perinatal digestive system disorders) -P83 (other conditions of integument specific to foetus and newborn) -P91 (other disturbances of cerebral status of newborn) -P92 (feeding problems of newborn) -P94 (disorders of muscle tone of newborn) -P95 (foetal death of unspecified cause) -P96 (other conditions originating in the perinatal period) -Q00 (anencephaly and similar malformations) -Q01 (encephalocele) -Q02 (microcephaly) -Q03 (congenital hydrocephalus) -Q04 (other congenital malformations of brain) -Q05 (spina bifida) -Q06 (other congenital malformations of spinal cord) -Q07 (other congenital malformations of nervous system) -Q10 (congenital malformations of eyelid, lachrymal apparatus and orbit) -Q11 (anophthalmos, microphthalmos and macrophthalmos) -Q12 (congenital lens malformations) -Q13 (congenital malformations of anterior segment of eye) -Q14 (congenital malformations of posterior segment of eye) -Q15 (other congenital malformations of eye) -Q16 (congenital malformations of ear causing impairment of hearing) -Q17 (other congenital malformations of ear) -Q18 (other congenital malformations of face and neck) -Q20 (congenital malformations of cardiac chambers and connexions) -Q21 (congenital malformations of cardiac septa) -Q22 (congenital malformations of pulmonary and tricuspid valves) -Q23 (congenital malformations of aortic and mitral valves) -Q24 (other congenital malformations of heart) -Q25 (congenital malformations of great arteries) -Q26 (congenital malformations of great veins) -Q27 (other congenital malformations of peripheral vascular system) -Q28 (other congenital malformations of circulatory system) -Q30 (congenital malformations of nose) -Q31 (congenital malformations of larynx) -Q32 (congenital malformations of trachea and bronchus) -Q33 (congenital malformations of lung) -Q34 (other congenital malformations of respiratory system) -Q35 (cleft palate) -Q36 (cleft lip) -Q37 (cleft palate with cleft lip) -Q38 (other congenital malformations of tongue, mouth and pharynx) -Q39 (congenital malformations of oesophagus) -Q40 (other congenital malformations of upper alimentary tract) -Q41 (congenital absence, atresia and stenosis of small intestine) -Q42 (congenital absence, atresia and stenosis of large intestine) -Q43 (other congenital malformations of intestine) -Q44 (congenital malformations of gallbladder, bile ducts and liver) -Q45 (other congenital malformations of digestive system) -Q50 (congenital malformations of ovaries, fallopian tubes and broad ligaments) -Q51 (congenital malformations of uterus and cervix) -Q52 (other congenital malformations of female genitalia) -Q53 (undescended testicle) -Q54 (hypospadias) -Q55 (other congenital malformations of male genital organs) -Q56 (indeterminate sex and pseudohermaphroditism) -Q60 (renal agenesis and other reduction defects of kidney) -Q61 (cystic kidney disease) -Q62 (congenital obstructive defects of renal pelvis and congenital malformations of ureter) -Q63 (other congenital malformations of kidney) -Q64 (other congenital malformations of urinary system) -Q65 (congenital deformities of hip) -Q66 (congenital deformities of feet) -Q67 (congenital musculoskeletal deformities of head, face, spine and chest) -Q68 (other congenital musculoskeletal deformities) -Q69 (polydactyly) -Q70 (syndactyly) -Q71 (reduction defects of upper limb) -Q72 (reduction defects of lower limb) -Q73 (reduction defects of unspecified limb) -Q74 (other congenital malformations of limb(s)) -Q75 (other congenital malformations of skull and face bones) -Q76 (congenital malformations of spine and bony thorax) -Q77 (osteochondrodysplasia with defects of growth of tubular bones and spine) -Q78 (other osteochondrodysplasias) -Q79 (congenital malformations of musculoskeletal system, not elsewhere classified) -Q80 (congenital ichthyosis) -Q81 (epidermolysis bullosa) -Q82 (other congenital malformations of skin) -Q83 (congenital malformations of breast) -Q84 (other congenital malformations of integument) -Q85 (phakomatoses, not elsewhere classified) -Q86 (congenital malformation syndromes due to known exogenous causes, not elsewhere classified) -Q87 (other specified congenital malformation syndromes affecting multiple systems) -Q89 (other congenital malformations, not elsewhere classified) -Q90 (down's syndrome) -Q91 (edwards' syndrome and patau's syndrome) -Q92 (other trisomies and partial trisomies of the autosomes, not elsewhere classified) -Q93 (monosomies and deletions from the autosomes, not elsewhere classified) -Q95 (balanced rearrangements and structural markers, not elsewhere classified) -Q96 (turner's syndrome) -Q97 (other sex chromosome abnormalities, female phenotype, not elsewhere classified) -Q98 (other sex chromosome abnormalities, male phenotype, not elsewhere classified) -Q99 (other chromosome abnormalities, not elsewhere classified) -CXX Unknown Cancer -C00 Malignant neoplasm of lip -C01 Malignant neoplasm of base of tongue -C02 Malignant neoplasm of other and unspecified parts of tongue -C03 Malignant neoplasm of gum -C04 Malignant neoplasm of floor of mouth -C05 Malignant neoplasm of palate -C06 Malignant neoplasm of other and unspecified parts of mouth -C07 Malignant neoplasm of parotid gland -C08 Malignant neoplasm of other and unspecified major salivary glands -C09 Malignant neoplasm of tonsil -C10 Malignant neoplasm of oropharynx -C11 Malignant neoplasm of nasopharynx -C12 Malignant neoplasm of pyriform sinus -C13 Malignant neoplasm of hypopharynx -C14 Malignant neoplasm of other and ill-defined sites in the lip, oral cavity and pharynx -C15 Malignant neoplasm of oesophagus -C16 Malignant neoplasm of stomach -C17 Malignant neoplasm of small intestine -C18 Malignant neoplasm of colon -C19 Malignant neoplasm of rectosigmoid junction -C20 Malignant neoplasm of rectum -C21 Malignant neoplasm of anus and anal canal -C22 Malignant neoplasm of liver and intrahepatic bile ducts -C23 Malignant neoplasm of gallbladder -C24 Malignant neoplasm of other and unspecified parts of biliary tract -C25 Malignant neoplasm of pancreas -C26 Malignant neoplasm of other and ill-defined digestive organs -C30 Malignant neoplasm of nasal cavity and middle ear -C31 Malignant neoplasm of accessory sinuses -C32 Malignant neoplasm of larynx -C33 Malignant neoplasm of trachea -C34 Malignant neoplasm of bronchus and lung -C37 Malignant neoplasm of thymus -C38 Malignant neoplasm of heart, mediastinum and pleura -C39 Malignant neoplasm of other and ill-defined sites in the respiratory system and intrathoracic organs -C40 Malignant neoplasm of bone and articular cartilage of limbs -C41 Malignant neoplasm of bone and articular cartilage of other and unspecified sites -C42 hematopoietic and reticuloendothelial systems (ICD-O-3 specific) -C43 Malignant melanoma of skin -C44 Other malignant neoplasms of skin -C45 Mesothelioma -C46 Kaposi's sarcoma -C47 Malignant neoplasm of peripheral nerves and autonomic nervous system -C48 Malignant neoplasm of retroperitoneum and peritoneum -C49 Malignant neoplasm of other connective and soft tissue -C50 Malignant neoplasm of breast -C51 Malignant neoplasm of vulva -C52 Malignant neoplasm of vagina -C53 Malignant neoplasm of cervix uteri -C54 Malignant neoplasm of corpus uteri -C55 Malignant neoplasm of uterus, part unspecified -C56 Malignant neoplasm of ovary -C57 Malignant neoplasm of other and unspecified female genital organs -C58 Malignant neoplasm of placenta -C60 Malignant neoplasm of penis -C61 Malignant neoplasm of prostate -C62 Malignant neoplasm of testis -C63 Malignant neoplasm of other and unspecified male genital organs -C64 Malignant neoplasm of kidney, except renal pelvis -C65 Malignant neoplasm of renal pelvis -C66 Malignant neoplasm of ureter -C67 Malignant neoplasm of bladder -C68 Malignant neoplasm of other and unspecified urinary organs -C69 Malignant neoplasm of eye and adnexa -C70 Malignant neoplasm of meninges -C71 Malignant neoplasm of brain -C72 Malignant neoplasm of spinal cord, cranial nerves and other parts of central nervous system -C73 Malignant neoplasm of thyroid gland -C74 Malignant neoplasm of adrenal gland -C75 Malignant neoplasm of other endocrine glands and related structures -C76 Malignant neoplasm of other and ill-defined sites -C77 Secondary and unspecified malignant neoplasm of lymph nodes -C78 Secondary malignant neoplasm of respiratory and digestive organs -C79 Secondary malignant neoplasm of other sites -C80 Malignant neoplasm without specification of site -C81 Hodgkin's disease -C82 Follicular [nodular] non-Hodgkin's lymphoma -C83 Diffuse non-Hodgkin's lymphoma -C84 Peripheral and cutaneous T-cell lymphomas -C85 Other and unspecified types of non-Hodgkin's lymphoma -C86 Other specified types of T/NK-cell lymphoma -C88 Malignant immunoproliferative diseases -C90 Multiple myeloma and malignant plasma cell neoplasms -C91 Lymphoid leukaemia -C92 Myeloid leukaemia -C93 Monocytic leukaemia -C94 Other leukaemias of specified cell type -C95 Leukaemia of unspecified cell type -C96 Other and unspecified malignant neoplasms of lymphoid, haematopoietic and related tissue -C97 Malignant neoplasms of independent (primary) multiple sites -D00 Carcinoma in situ of oral cavity, oesophagus and stomach -D01 Carcinoma in situ of other and unspecified digestive organs -D02 Carcinoma in situ of middle ear and respiratory system -D03 Melanoma in situ -D04 Carcinoma in situ of skin -D05 Carcinoma in situ of breast -D06 Carcinoma in situ of cervix uteri -D07 Carcinoma in situ of other and unspecified genital organs -D09 Carcinoma in situ of other and unspecified sites -D10 Benign neoplasm of mouth and pharynx -D11 Benign neoplasm of major salivary glands -D12 Benign neoplasm of colon, rectum, anus and anal canal -D13 Benign neoplasm of other and ill-defined parts of digestive system -D15 Benign neoplasm of other and unspecified intrathoracic organs -D16 Benign neoplasm of bone and articular cartilage -D18 Haemangioma and lymphangioma, any site -D27 Benign neoplasm of ovary -D30 Benign neoplasm of urinary organs -D32 Benign neoplasm of meninges -D33 Benign neoplasm of brain and other parts of central nervous system -D34 Benign neoplasm of thyroid gland -D35 Benign neoplasm of other and unspecified endocrine glands -D36 Benign neoplasm of other and unspecified sites -D37 Neoplasm of uncertain or unknown behaviour of oral cavity and digestive organs -D38 Neoplasm of uncertain or unknown behaviour of middle ear and respiratory and intrathoracic organs -D39 Neoplasm of uncertain or unknown behaviour of female genital organs -D40 Neoplasm of uncertain or unknown behaviour of male genital organs -D41 Neoplasm of uncertain or unknown behaviour of urinary organs -D42 Neoplasm of uncertain or unknown behaviour of meninges -D43 Neoplasm of uncertain or unknown behaviour of brain and central nervous system -D44 Neoplasm of uncertain or unknown behaviour of endocrine glands -D45 Polycythaemia vera -D46 Myelodysplastic syndromes -D47 Other neoplasms of uncertain or unknown behaviour of lymphoid, haematopoietic and related tissue -D48 Neoplasm of uncertain or unknown behaviour of other and unspecified sites -O01 Hydatidiform mole38 -Death diff --git a/delphi_fork/prepare_data.py b/delphi_fork/prepare_data.py deleted file mode 100644 index 5f47fd4..0000000 --- a/delphi_fork/prepare_data.py +++ /dev/null @@ -1,216 +0,0 @@ -import pandas as pd # Pandas for data manipulation -import tqdm # Progress bar for chunk processing -import numpy as np # Numerical operations - -train_frac = 0.7 # Fraction of participants for training split -val_frac = 0.15 # Fraction of participants for validation split -test_frac = 0.15 # Fraction of participants for test split - -# CSV mapping field IDs to human-readable names -field_map_file = "../field_ids_enriched.csv" -field_dict = {} # Map original field ID -> new column name -with open(field_map_file, "r", encoding="utf-8") as f: # Open the field mapping file - next(f) # skip header line - for line in f: # Iterate through lines - parts = line.strip().split(",") # Split by CSV commas - if len(parts) >= 3: # Ensure we have at least id and name columns (fix: was >=2) - # Original field identifier (e.g., "34-0.0") - field_id = parts[0] - field_name = parts[2] # Human-readable column name - field_dict[field_id] = field_name # Record the mapping - # Track as a potential tabular feature - - -# TSV mapping field IDs to ICD10-related date columns -field_to_icd_map = "../icd10_codes_mod.tsv" -# Date-like variables to be converted to offsets -date_vars = [] -with open(field_to_icd_map, "r", encoding="utf-8") as f: # Open ICD10 mapping - for line in f: # Iterate each mapping row - parts = line.strip().split() # Split on whitespace for TSV - if len(parts) >= 6: # Guard against malformed lines - # Map field ID to the date column name - field_dict[parts[0]] = parts[5] - date_vars.append(parts[5]) # Track date column names in order - -for j in range(17): # Map up to 17 cancer entry slots (dates and types) - # Cancer diagnosis date slot j - field_dict[f'40005-{j}.0'] = f'cancer_date_{j}' - field_dict[f'40006-{j}.0'] = f'cancer_type_{j}' # Cancer type/code slot j - -# Number of ICD-related date columns before adding extras -len_icd = len(date_vars) -date_vars.extend(['Death', 'date_of_assessment'] + # Add outcome date and assessment date - # Add cancer date columns - [f'cancer_date_{j}' for j in range(17)]) - -labels_file = "labels.csv" # File listing label codes -label_dict = {} # Map code string -> integer label id -with open(labels_file, "r", encoding="utf-8") as f: # Open labels file - for idx, line in enumerate(f): # Enumerate to assign incremental label IDs - parts = line.strip().split(' ') # Split by space - if parts and parts[0]: # Guard against empty lines - label_dict[parts[0]] = idx - -event_list = [] # Accumulator for event arrays across chunks -ukb_iterator = pd.read_csv( # Stream UK Biobank data in chunks - "../ukb_data.csv", - sep=',', - chunksize=10000, # Stream file in manageable chunks to reduce memory footprint - # First column (participant ID) becomes DataFrame index - index_col=0, - low_memory=False # Disable type inference optimization for consistent dtypes -) -# Iterate chunks with progress -for ukb_chunk in tqdm.tqdm(ukb_iterator, desc="Processing UK Biobank data"): - # Rename columns to friendly names - ukb_chunk = ukb_chunk.rename(columns=field_dict) - # Require sex to be present - ukb_chunk.dropna(subset=['sex'], inplace=True) - ukb_chunk['sex'] += 2 # Recode sex: 0-> 2, 1 -> 3 - - # Construct date of birth from year and month (day fixed to 1) - ukb_chunk['dob'] = pd.to_datetime( - # Guard against malformed dates - ukb_chunk[['year', 'month']].assign(DAY=1), errors='coerce' - ) - - # Use only date variables that actually exist in the current chunk - present_date_vars = [c for c in date_vars if c in ukb_chunk.columns] - - # Convert date-like columns to datetime and compute day offsets from dob - if present_date_vars: - date_cols = ukb_chunk[present_date_vars].apply( - pd.to_datetime, format="%Y-%m-%d", errors='coerce' # Parse dates safely - ) - date_cols_days = date_cols.sub( - ukb_chunk['dob'], axis=0) # Timedelta relative to dob - ukb_chunk[present_date_vars] = date_cols_days.apply( - lambda x: x.dt.days) # Store days since dob - - # Process disease events from ICD10-related date columns - # Take ICD date cols plus 'Death' if present by order - icd10_cols = present_date_vars[:len_icd + 1] - # Melt to long form: participant id, event code (column name), and days offset - melted_df = ukb_chunk.reset_index().melt( - id_vars=['eid'], - value_vars=icd10_cols, - var_name='event_code', - value_name='days', - ) - # Require non-missing day offsets - melted_df.dropna(subset=['days'], inplace=True) - if not melted_df.empty: - melted_df['label'] = melted_df['event_code'].map( - label_dict) # Map event code to numeric label - # Fix: ensure labels exist before int cast - melted_df.dropna(subset=['label'], inplace=True) - if not melted_df.empty: - event_list.append( - melted_df[['eid', 'days', 'label']] - .astype(int) # Safe now since label and days are non-null - .to_numpy() - ) - - df_res = ukb_chunk.reset_index() # Bring participant ID out of index - # Simplify stub names for wide_to_long - # Rename date stubs - rename_dict = {f'cancer_date_{j}': f'cancerdate{j}' for j in range(17)} - rename_dict.update( - # Rename type stubs - {f'cancer_type_{j}': f'cancertype{j}' for j in range(17)}) - df_renamed = df_res.rename(columns=rename_dict) # Apply renaming - stubs_to_use = [] # Collect available stubs - if any('cancerdate' in col for col in df_renamed.columns): - stubs_to_use.append('cancerdate') # Date stub present - if any('cancertype' in col for col in df_renamed.columns): - stubs_to_use.append('cancertype') # Type stub present - - if len(stubs_to_use) == 2: # Only proceed if both date and type columns exist - long_cancer = pd.wide_to_long( - df_renamed, - stubnames=stubs_to_use, - i=['eid'], # Participant ID identifier - j='cancer_num' # Index over cancer record number (0..16) - ).dropna() # Remove rows missing either date or type - if not long_cancer.empty: - long_cancer['cancer'] = long_cancer['cancertype'].str.slice( - 0, 3) # Use first 3 chars as code - long_cancer['cancer_label'] = long_cancer['cancer'].map( - label_dict) # Map to label id - cancer_array = ( - long_cancer.reset_index( - )[['eid', 'cancerdate', 'cancer_label']] - .dropna() - .astype(int) - .to_numpy() - ) - if cancer_array.size > 0: - event_list.append(cancer_array) # Append cancer events - - # Process BMI, smoking, and alcohol status - ukb_bmi = ukb_chunk[['date_of_assessment', 'bmi']].dropna().reset_index() - if not ukb_bmi.empty: - ukb_bmi['bmi_status'] = np.select( - [ukb_bmi['bmi'] > 28, ukb_bmi['bmi'] > 22], - [6, 5], - default=4 - ) - event_list.append( - ukb_bmi[['eid', 'date_of_assessment', 'bmi_status']] - .astype(int) - .to_numpy() - ) - - ukb_sm = ukb_chunk[['date_of_assessment', 'smoking']].dropna().reset_index() - ukb_sm = ukb_sm[ukb_sm['smoking'] != -3] # Exclude unknown smoking status - if not ukb_sm.empty: - ukb_sm['smoking_status'] = np.select( - [ukb_sm['smoking'] == 1, ukb_sm['smoking'] == 2], - [9, 8], - default=7 - ) - event_list.append( - ukb_sm[['eid', 'date_of_assessment', 'smoking_status']] - .astype(int) - .to_numpy() - ) - ukb_al = ukb_chunk[['date_of_assessment', 'alcohol']].dropna().reset_index() - ukb_al = ukb_al[ukb_al['alcohol'] != -3] # Exclude unknown alcohol status - if not ukb_al.empty: - ukb_al['alcohol_status'] = np.select( - [ukb_al['alcohol'] == 1, ukb_al['alcohol'] < 4], - [12, 11], - default=10 - ) - event_list.append( - ukb_al[['eid', 'date_of_assessment', 'alcohol_status']] - .astype(int) - .to_numpy() - ) - -# Combine tabular chunks - -data = np.vstack(event_list) # Stack all event arrays into one - -# Sort by participant then day -data = data[np.lexsort((data[:, 1], data[:, 0]))] - -# Keep only events with non-negative day offsets -data = data[data[:, 1] >= 0] - -# Remove duplicate (participant_id, label) pairs keeping first occurrence. -data = pd.DataFrame(data).drop_duplicates([0, 2]).values - -# Store compactly using unsigned 32-bit integers -data = data.astype(np.uint32) - -# Split data into train/val/test based on unique participant IDs -unique_ids = np.unique(data[:, 0]) # Unique participant IDs -train_split_id = unique_ids[int(len(unique_ids) * train_frac)] -val_split_id = unique_ids[int(len(unique_ids) * (train_frac + val_frac))] - -train_data = data[data[:, 0] <= train_split_id].tofile("ukb_real_train.bin") -val_data = data[(data[:, 0] > train_split_id) & ( - data[:, 0] <= val_split_id)].tofile("ukb_real_val.bin") -test_data = data[data[:, 0] > val_split_id].tofile("ukb_real_test.bin") diff --git a/losses.py b/losses.py new file mode 100644 index 0000000..85e93ff --- /dev/null +++ b/losses.py @@ -0,0 +1,112 @@ +import torch +import torch.nn as nn +import torch.nn.functional as F + +class ExponentialNLLLoss(nn.Module): + + def __init__( + self, + n_tech_tokens: int, + alpha: float = 0.1, + ): + super().__init__() + self.n_tech_tokens = n_tech_tokens + self.alpha = alpha + + def forward( + self, + logits: torch.Tensor, + event_seqs: torch.Tensor, + time_seqs: torch.Tensor, + ) -> torch.Tensor: + # Calculate the negative log-likelihood for the exponential distribution + + # 1, shift event_seqs to remove technical tokens + target_event_seqs = event_seqs[:, 1:] - self.n_tech_tokens + mask = target_event_seqs >= 0 + # 2, create a mask to filter out technical tokens + if not mask.any(): + # if there are no valid events, return zero loss + return logits.new_zeros(()) + + # 3, compute time differences + dt = time_seqs[:, 1:] - time_seqs[:, :-1] + dt = dt[mask] # (N,) + # 4, filter target events + target_events = target_event_seqs[mask] # (N,) + # 5, compute hazard and total hazard + hazard = logits[:, :-1, :] # (B, L-1, vocab_size) + hazard_at_events = hazard[mask].gather( + dim=-1, index=target_events.unsqueeze(-1)).squeeze(-1) # (N,) + total_hazard = hazard[mask].sum(dim=-1) # (N,) + # 6, compute negative log-likelihood + nll = torch.log(hazard_at_events + 1e-6) - total_hazard * dt + nll = -nll.mean() + # 7, compute cross-entropy regularization + p_ce = hazard_at_events / total_hazard + regularization = -self.alpha * torch.log(p_ce + 1e-6).mean() + + return nll + regularization + + +class WeibullLosses(nn.Module): + + def __init__( + self, + n_tech_tokens: int, + alpha: float = 0.1, + ): + super().__init__() + self.n_tech_tokens = n_tech_tokens + self.alpha = alpha + + def forward( + self, + shapes: torch.Tensor, + scales: torch.Tensor, + event_seqs: torch.Tensor, + time_seqs: torch.Tensor, + ) -> torch.Tensor: + # Calculate the negative log-likelihood for the Weibull distribution + + # 1, shift event_seqs to remove technical tokens + target_event_seqs = event_seqs[:, 1:] - self.n_tech_tokens + mask = target_event_seqs >= 0 + # 2, create a mask to filter out technical tokens + if not mask.any(): + # if there are no valid events, return zero loss + return shapes.new_zeros(()) + + # 3, compute time differences + dt = time_seqs[:, 1:] - time_seqs[:, :-1] + dt = dt[mask] # (N,) + # 4, filter target events + target_events = target_event_seqs[mask] # (N,) + shapes = shapes[mask] # (N, vocab_size) + scales = scales[mask] # (N, vocab_size) + # 5, compute shape and scale at events + shape_at_events = shapes.gather( + dim=-1, index=target_events.unsqueeze(-1)).squeeze(-1) # (N,) + scale_at_events = scales.gather( + dim=-1, index=target_events.unsqueeze(-1)).squeeze(-1) # (N,) + log_shapes = torch.log(shape_at_events) + log_scales = torch.log(scale_at_events) + log_dt = torch.log(dt + 1e-6) + # 6, compute negative log-likelihood + nll = log_shapes - log_scales + \ + (shape_at_events - 1) * (log_dt - log_scales) + log_tot_survival = (dt.unsqueeze(-1) / + scales) ** shapes # (N, vocab_size) + nll -= log_tot_survival.sum(dim=-1) + nll = -nll.mean() + # 7, compute cross-entropy regularization + log_shapes_all = torch.log(shapes) + log_scales_all = torch.log(scales) + log_dt_expanded = log_dt.unsqueeze(-1) + + log_hazards = log_shapes_all - log_scales_all + (shapes - 1) * \ + (log_dt_expanded - log_scales_all) # (N, vocab_size) + ce_loss = F.cross_entropy( + log_hazards, target_events, reduction='mean') + + return nll + self.alpha * ce_loss diff --git a/model.py b/model.py new file mode 100644 index 0000000..5df3936 --- /dev/null +++ b/model.py @@ -0,0 +1,129 @@ +import torch +import torch.nn as nn +from age_encoder import AgeSinusoidalEncoder, AgeMLPEncoder +from backbones import Block, ModernBlock, RMSNorm + +class TabularEncoder(nn.Module): + def __init__( + self, + n_embd: int, + n_continuous: int, + n_categorical: int, + categorical_cardinalities: list[int], + ): + super().__init__() + self.continuous_proj = nn.Linear(n_continuous, n_embd) if n_continuous > 0 else None + self.categorical_embeddings = nn.ModuleList([ + nn.Embedding(cardinality, n_embd) for cardinality in categorical_cardinalities + ]) if n_categorical > 0 else None + + def forward( + self, + continuous_features: torch.Tensor | None, + categorical_features: list[torch.Tensor] | None, + ) -> torch.Tensor: + embeddings = [] + if self.continuous_proj is not None and continuous_features is not None: + cont_emb = self.continuous_proj(continuous_features) + embeddings.append(cont_emb) + if self.categorical_embeddings is not None and categorical_features is not None: + for emb_layer, cat_feat in zip(self.categorical_embeddings, categorical_features): + cat_emb = emb_layer(cat_feat) + embeddings.append(cat_emb) + if embeddings: + return torch.sum(torch.stack(embeddings, dim=0), dim=0) + else: + raise ValueError("No features provided for TabularEncoder.") + +def merge_two_sequences( + time_seq1: torch.Tensor, # (B, L1) + time_seq2: torch.Tensor, # (B, L2) + seq1_embd: torch.Tensor, # (B, L1, D) + seq2_embd: torch.Tensor, # (B, L2, D) +) -> torch.Tensor: + """Merge two time sequences and their embeddings based on time order.""" + B, L1 = time_seq1.shape + L2 = time_seq2.shape[1] + merged_times = torch.cat([time_seq1, time_seq2], dim=1) # (B, L1 + L2) + merged_embd = torch.cat([seq1_embd, seq2_embd], dim=1) # (B, L1 + L2, D) + + sorted_times, indices = torch.sort(merged_times, dim=1) # (B, L1 + L2) + batch_indices = torch.arange(B).unsqueeze(-1).expand(-1, L1 + L2) # (B, L1 + L2) + sorted_embd = merged_embd[batch_indices, indices] # (B, L1 + L2, D) + + return sorted_times, sorted_embd + + +class DelphiFork(nn.Module): + def __init__( + self, + vocab_size: int, + n_embd: int, + n_head: int, + n_layer: int, + n_continuous: int, + n_categorical: int, + categorical_cardinalities: list[int], + pdrop: float = 0.1, + token_pdrop: float = 0.1, + ): + super().__init__() + self.token_embedding = nn.Embedding(vocab_size, n_embd) + self.age_encoder = AgeSinusoidalEncoder(n_embd=n_embd) + self.sex_encoder = nn.Embedding(2, n_embd) + self.token_dropout = nn.Dropout(token_pdrop) + self.covariate_encoder = TabularEncoder( + n_embd=n_embd, + n_continuous=n_continuous, + n_categorical=n_categorical, + categorical_cardinalities=categorical_cardinalities, + ) + + self.blocks = nn.ModuleList([ + Block( + n_embd=n_embd, + n_head=n_head, + pdrop=pdrop, + ) for _ in range(n_layer) + ]) + + self.ln_f = nn.LayerNorm(n_embd) + self.head = nn.Linear(n_embd, vocab_size, bias=False) + self.head.weight = self.token_embedding.weight + + def forward( + self, + sex: torch.Tensor, + event_seq: torch.Tensor, + age_seq: torch.Tensor, + cov_seq_time: torch.Tensor | None = None, + cont_cov_seq: torch.Tensor | None = None, + cat_cov_seq: list[torch.Tensor] | None = None, + + ) -> torch.Tensor: + + event_emb = self.token_embedding(event_seq) + age_emb = self.age_encoder(age_seq) + sex_emb = self.sex_encoder(sex.unsqueeze(-1)) # (B, 1) -> (B, 1, n_embd) + + x = event_emb + age_emb + sex_emb + if cov_seq_time is not None: + covariate_emb = self.covariate_encoder( + continuous_features=cont_cov_seq, + categorical_features=cat_cov_seq, + ) + covariate_emb = covariate_emb + self.age_encoder(cov_seq_time) + sex_emb + x = merge_two_sequences(age_seq, cov_seq_time, x, covariate_emb) + + x = self.token_dropout(x) + + for block in self.blocks: + x = block(x) + + x = self.ln_f(x) + logits = self.head(x) + return logits + + + + \ No newline at end of file