diff --git a/CMakeLists.txt b/CMakeLists.txt new file mode 100644 index 0000000..ab0e4bd --- /dev/null +++ b/CMakeLists.txt @@ -0,0 +1,40 @@ +cmake_minimum_required(VERSION 3.15) +project(snplib LANGUAGES CXX) + +set(CMAKE_CXX_STANDARD 17) +set(CMAKE_CXX_STANDARD_REQUIRED ON) + +option(USE_MKL "Use Intel MKL" OFF) +option(USE_OPENBLAS "Use OpenBLAS" OFF) + +if(USE_MKL) + find_package(MKL REQUIRED) + if(MKL_FOUND) + include_directories(${MKL_INCLUDE_DIRS}) + link_libraries(${MKL_LIBRARIES}) + endif() +elseif(USE_OPENBLAS) + find_package(OpenBLAS REQUIRED) + if(OPENBLAS_FOUND) + include_directories(${OPENBLAS_INCLUDE_DIRS}) + link_libraries(${OPENBLAS_LIBRARIES}) + endif() +endif() + +include_directories(src) + +add_library(snplib + src/data_manage.cc + src/grm.cc + src/ibs.cc + src/king.cc + src/snp.cc + src/statistics.cc + src/ugrm.cc +) + +target_include_directories(snplib PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/src) + +# Example executable +add_executable(example main.cc) +target_link_libraries(example snplib) diff --git a/GEMINI.md b/GEMINI.md new file mode 100644 index 0000000..926fc4e --- /dev/null +++ b/GEMINI.md @@ -0,0 +1,42 @@ +# Project: SNPLIB + +## 1. Overview + +This project (SNPLIB) is a high-performance computing library for Quantitative Genetics developed in C++. The core objective of the project is to leverage modern C++ features and achieve extreme numerical computation performance through optimized math libraries (such as Intel MKL or OpenBLAS). + +This project uses CMake for cross-platform build management and strictly adheres to the Google C++ Coding Style. + +## 2. Core Tech Stack & Standards + +### 2.1 Programming Language + +* **C++**: The project will use C++ (C++17 or higher recommended) as the primary development language. + +### 2.2 Build System + +* **CMake**: This project uses CMake (version 3.15 or higher recommended) as the sole cross-platform build system. +* `CMakeLists.txt` files are responsible for managing all compilation targets, dependency linking, and platform-specific configurations. + +### 2.3 Performance & Dependencies + +* **BLAS/LAPACK Acceleration**: To achieve high-performance matrix and vector operations, this project relies on the BLAS and LAPACK interfaces. +* **Supported Libraries**: + * **Intel MKL (Math Kernel Library)**: The preferred high-performance library, especially on Intel architectures. + * **OpenBLAS**: A high-performance open-source alternative for BLAS/LAPACK with excellent cross-platform compatibility. +* The CMake configuration will provide options (e.g., `USE_MKL=ON` or `USE_OPENBLAS=ON`) to allow users to select the math library to link against at compile time. + +### 2.4 Coding Style + +* **Google C++ Coding Style**: This project strictly adheres to the [Google C++ Style Guide](https://google.github.io/styleguide/cppguide.html). +* **Formatting**: All C++ code submitted must be formatted using `clang-format` (based on the Google style). +* **Linting**: Using `clang-tidy` is recommended to ensure code quality and style consistency. + +## 3. Environment Setup + +Before starting the build, you must ensure the following dependencies are installed on your system: + +1. **C++ Compiler** (e.g., GCC, Clang, MSVC) +2. **CMake** (version >= 3.15) +3. **Intel MKL** or **OpenBLAS** + * For Intel MKL, ensure the `MKLROOT` environment variable is set, or point to its installation path via the CMake `CMAKE_PREFIX_PATH`. + * For OpenBLAS, ensure CMake can find it via `find_package(OpenBLAS)`. diff --git a/main.cc b/main.cc new file mode 100644 index 0000000..463d030 --- /dev/null +++ b/main.cc @@ -0,0 +1,6 @@ +#include + +int main() { + std::cout << "Hello, SNPLIB!" << std::endl; + return 0; +} diff --git a/src/data_manage.cc b/src/data_manage.cc new file mode 100644 index 0000000..0d0e317 --- /dev/null +++ b/src/data_manage.cc @@ -0,0 +1,98 @@ +// data_manage.cc +#include "data_manage.h" + +#include +#include +#include + +#include "snp.h" + +namespace snplib { + +// Unpacks genotype data from a packed format to a double-precision floating-point +// format. +void UnpackGeno(const uint8_t *geno, size_t num_samples, size_t num_snps, + double *geno_d) { + const std::array geno_table{0.0, 0.0, 1.0, 2.0}; + SNP snp(geno, num_samples); + for (size_t i = 0; i < num_snps; ++i) { + auto *snp_geno_d = geno_d + i * num_samples; + snp.UnpackGeno(geno_table, snp_geno_d); + snp += 1; + } +} + +// Unpacks genotype data for GRM calculation, standardizing the genotypes. +void UnpackGRMGeno(const uint8_t *geno, const double *af, size_t num_samples, + size_t num_snps, double *geno_d) { + std::array geno_table{0.0, 0.0, 0.0, 0.0}; + SNP snp(geno, num_samples); + for (size_t i = 0; i < num_snps; ++i) { + auto mu = 2.0 * af[i]; + auto std = std::sqrt(2.0 * af[i] * (1.0 - af[i])); + geno_table[0] = -mu / std; + geno_table[2] = (1.0 - mu) / std; + geno_table[3] = (2.0 - mu) / std; + auto *snp_geno_d = geno_d + i * num_samples; + snp.UnpackGeno(geno_table, snp_geno_d); + snp += 1; + } +} + +// Unpacks genotype data into a -1, 0, 1 format. +void UnpackUGeno(const uint8_t *geno, size_t num_samples, size_t num_snps, + double *geno_d) { + const std::array geno_table{-1.0, 0.0, 0.0, 1.0}; + SNP snp(geno, num_samples); + for (size_t i = 0; i < num_snps; ++i) { + auto *snp_geno_d = geno_d + i * num_samples; + snp.UnpackGeno(geno_table, snp_geno_d); + snp += 1; + } +} + +// Flips the genotypes of specified SNPs. +void FlipGeno(uint8_t *geno, size_t num_samples, size_t num_snps, + const int32_t *index) { + auto num_full_bytes = num_samples / 4; + auto num_samples_left = num_samples % 4; + auto num_bytes = num_full_bytes + (num_samples_left > 0 ? 1 : 0); + for (size_t i = 0; i < num_snps; ++i) { + auto *tmp_g = geno + index[i] * num_bytes; + for (size_t j = 0; j < num_bytes; ++j) { + uint8_t g = ~tmp_g[j]; + uint8_t g1 = (g >> 1) & 0x55; + uint8_t g2 = (g << 1) & 0xAA; + tmp_g[j] = g1 + g2; + } + } +} + +// Creates a new genotype dataset by selecting a subset of samples. +void Keep(const uint8_t *src_geno, uint8_t *dest_geno, size_t num_src_samples, + size_t num_dest_samples, size_t num_snps, const int32_t *index) { + SNP src_snp(src_geno, num_src_samples); + auto num_full_bytes = num_dest_samples / 4; + auto num_samples_left = num_dest_samples % 4; + auto num_bytes = num_full_bytes + (num_samples_left > 0 ? 1 : 0); + for (size_t l = 0; l < num_snps; ++l) { + auto *tmp_g = dest_geno + l * num_bytes; + for (size_t i = 0; i < num_full_bytes; ++i) { + uint8_t t = src_snp(index[4 * i]); + t += src_snp(index[4 * i + 1]) << 2; + t += src_snp(index[4 * i + 2]) << 4; + t += src_snp(index[4 * i + 3]) << 6; + tmp_g[i] = t; + } + if (num_samples_left > 0) { + uint8_t t = 0; + for (size_t i = 0; i < num_samples_left; ++i) { + t += src_snp(index[4 * num_full_bytes + i]) << (2 * i); + } + tmp_g[num_full_bytes] = t; + } + src_snp += 1; + } +} + +} // namespace snplib \ No newline at end of file diff --git a/src/data_manage.h b/src/data_manage.h new file mode 100644 index 0000000..f0af81d --- /dev/null +++ b/src/data_manage.h @@ -0,0 +1,33 @@ +// data_manage.h +#ifndef SNPLIB_SRC_DATA_MANAGE_H_ +#define SNPLIB_SRC_DATA_MANAGE_H_ + +#include +#include + +namespace snplib { + +// Unpacks genotype data from a packed format to a double-precision floating-point +// format. +void UnpackGeno(const uint8_t *geno, size_t num_samples, size_t num_snps, + double *geno_d); + +// Unpacks genotype data for GRM calculation, standardizing the genotypes. +void UnpackGRMGeno(const uint8_t *geno, const double *af, size_t num_samples, + size_t num_snps, double *geno_d); + +// Unpacks genotype data into a -1, 0, 1 format. +void UnpackUGeno(const uint8_t *geno, size_t num_samples, size_t num_snps, + double *geno_d); + +// Flips the genotypes of specified SNPs. +void FlipGeno(uint8_t *geno, size_t num_samples, size_t num_snps, + const int32_t *index); + +// Creates a new genotype dataset by selecting a subset of samples. +void Keep(const uint8_t *src_geno, uint8_t *dest_geno, size_t num_src_samples, + size_t num_dest_samples, size_t num_snps, const int32_t *index); + +} // namespace snplib + +#endif // SNPLIB_SRC_DATA_MANAGE_H_ diff --git a/src/grm.cc b/src/grm.cc new file mode 100644 index 0000000..7daca18 --- /dev/null +++ b/src/grm.cc @@ -0,0 +1,244 @@ +// grm.cc +#include "grm.h" + +#include +#include +#include +#include + +#include "snp.h" + +namespace snplib { + +namespace { + +const uint64_t kSnpMask = 0x1249124912491249ull; // 0x1249=b0001001001001001 + +// Calculates the SNP correlation table for a given allele frequency. +void CalcSnpCorrelation(double af, std::array &table) { + table[0] = 2.0 * af / (1.0 - af); // 00 00 + table[1] = 0.0; // 00 01 + table[2] = 1.0 / (1.0 - af) - 2; // 00 10 + table[3] = -2.0; // 00 11 + table[4] = 0.5 / af / (1.0 - af) - 2.0; // 10 10 + table[5] = 1.0 / af - 2.0; // 10 11 + table[6] = 2.0 / af - 2.0; // 11 11 + table[7] = 0.0; // 01 XX +} + +// Updates the lookup table for a block of 20 SNPs. +void UpdateLookupTable(const double *af, double *lookup_table) { + std::array, 5> tables; + for (size_t i = 0; i < 4; ++i) { + auto *lkt = lookup_table + i * 32768; + for (size_t j = 0; j < 5; j++) { + CalcSnpCorrelation(af[5 * i + j], tables[j]); + } + for (size_t j = 0; j < 32768; ++j) { + lkt[j] = tables[0][j & 7u] + tables[1][j >> 3 & 7u] + + tables[2][j >> 6 & 7u] + tables[3][j >> 9 & 7u] + + tables[4][j >> 12 & 7u]; + } + } +} + +// Updates the lookup table for the remaining SNPs. +void UpdateLookupTable(const double *af, size_t num_snps_left, + double *lookup_table) { + std::array, 5> tables; + size_t shorts_left = num_snps_left / 5; + size_t snps_remain = num_snps_left % 5; + for (size_t i = 0; i < shorts_left; ++i) { + auto *lkt = lookup_table + i * 32768; + for (size_t j = 0; j < 5; j++) { + CalcSnpCorrelation(af[5 * i + j], tables[j]); + } + for (size_t j = 0; j < 32768; ++j) { + lkt[j] = tables[0][j & 7u] + tables[1][j >> 3 & 7u] + + tables[2][j >> 6 & 7u] + tables[3][j >> 9 & 7u] + + tables[4][j >> 12 & 7u]; + } + } + if (snps_remain != 0u) { + auto *lkt = lookup_table + shorts_left * 32768; + for (size_t j = 0; j < snps_remain; j++) { + CalcSnpCorrelation(af[5 * shorts_left + j], tables[j]); + } + for (size_t j = snps_remain; j < 5; ++j) { + std::fill(tables[j].begin(), tables[j].end(), 0.0); + } + for (size_t j = 0; j < 32768; ++j) { + lkt[j] = tables[0][j & 7u] + tables[1][j >> 3 & 7u] + + tables[2][j >> 6 & 7u] + tables[3][j >> 9 & 7u] + + tables[4][j >> 12 & 7u]; + } + } +} + +// Creates a mask for the genotype data. +void MaskGeno(const uint64_t *geno64, size_t num_samples, uint64_t *mask64) { + for (size_t i = 0; i < num_samples; ++i) { + uint64_t tmp_geno = ~(geno64[i] ^ kSnpMask); + tmp_geno = (tmp_geno & (tmp_geno >> 1)) & kSnpMask; + mask64[i] = tmp_geno * 7ull; + } +} + +// Updates the GRM with the contribution from a block of SNPs. +void UpdateGRM(const uint64_t *geno64, const uint64_t *mask64, + const double *lookup_table, size_t num_samples, + double *matrix) { + for (size_t i = 0; i < num_samples; i++) { + if (mask64[i] == 0u) { + for (size_t j = i; j < num_samples; ++j) { + uint64_t g = (geno64[i] + geno64[j]) | mask64[j]; + matrix[i * num_samples + j] += + lookup_table[g & 0x7fffu] + + lookup_table[32768 + (g >> 16 & 0x7fffu)] + + lookup_table[65536 + (g >> 32 & 0x7fffu)] + + lookup_table[98304 + (g >> 48 & 0x7fffu)]; + } + } else { + for (size_t j = i; j < num_samples; ++j) { + uint64_t g = (geno64[i] + geno64[j]) | (mask64[i] | mask64[j]); + matrix[i * num_samples + j] += + lookup_table[g & 0x7fffu] + + lookup_table[32768 + (g >> 16 & 0x7fffu)] + + lookup_table[65536 + (g >> 32 & 0x7fffu)] + + lookup_table[98304 + (g >> 48 & 0x7fffu)]; + } + } + } +} + +// Thread function for calculating the GRM. +void CalcGRMMatrixThread(const uint8_t *geno, const double *af, + size_t num_samples, size_t num_snps, double *matrix) { + SNP snp(geno, num_samples); + auto num_blocks = num_snps / 20u; + auto num_snps_left = num_snps % 20u; + std::vector geno64(num_samples); + std::vector mask64(num_samples); + std::vector lookup_table(4 * 32768); + std::fill(matrix, matrix + num_samples * num_samples, 0.0); + for (size_t i = 0; i < num_blocks; ++i) { + snp.TransposeGeno(20, geno64.data()); + UpdateLookupTable(af, lookup_table.data()); + MaskGeno(geno64.data(), num_samples, mask64.data()); + UpdateGRM(geno64.data(), mask64.data(), lookup_table.data(), num_samples, matrix); + snp += 20; + af += 20; + } + if (num_snps_left > 0) { + snp.TransposeGeno(num_snps_left, geno64.data()); + UpdateLookupTable(af, num_snps_left, lookup_table.data()); + MaskGeno(geno64.data(), num_samples, mask64.data()); + UpdateGRM(geno64.data(), mask64.data(), lookup_table.data(), num_samples, matrix); + } +} + +// Thread function for calculating the diagonal of the GCTA GRM. +void CalcGCTADiagonalThread(const uint8_t *geno, const double *af, size_t num_samples, + size_t num_snps, double *diagonal) { + SNP snp(geno, num_samples); + std::vector geno_d(num_samples, 0.0); + std::fill(diagonal, diagonal + num_samples, 0.0); + std::array geno_table{0.0, 0.0, 0.0, 0.0}; + for (size_t i = 0; i < num_snps; ++i) { + auto var = 2.0 * af[i] * (1.0 - af[i]); + geno_table[0] = 2.0 * af[i] / var; + geno_table[3] = (2.0 - 2.0 * af[i]) / var; + snp.UnpackGeno(geno_table, geno_d.data()); + for (size_t j = 0; j < num_samples; ++j) { + diagonal[j] += geno_d[j]; + } + snp += 1; + } +} + +} // namespace + +// Calculates the Genomic Relationship Matrix (GRM) from genotype data. +void CalcGRMMatrix(const uint8_t *geno, const double *af, size_t num_samples, + size_t num_snps, double *grm, size_t num_threads) { + std::vector workers; + std::vector matrices(num_samples * num_samples * num_threads); + auto num_snps_job = num_snps / num_threads + 1; + auto num_snps_left = num_snps % num_threads; + auto num_full_bytes = num_samples / 4; + auto num_samples_left = num_samples % 4; + auto num_bytes = num_full_bytes + (num_samples_left > 0 ? 1 : 0); + for (size_t i = 0; i < num_snps_left; ++i) { + workers.emplace_back(CalcGRMMatrixThread, geno, af, num_samples, + num_snps_job, + matrices.data() + i * num_samples * num_samples); + geno += num_snps_job * num_bytes; + af += num_snps_job; + } + --num_snps_job; + for (size_t i = num_snps_left; i < num_threads; ++i) { + workers.emplace_back(CalcGRMMatrixThread, geno, af, num_samples, + num_snps_job, + matrices.data() + i * num_samples * num_samples); + geno += num_snps_job * num_bytes; + af += num_snps_job; + } + for (auto &&iter : workers) { + iter.join(); + } + std::fill(grm, grm + num_samples * num_samples, 0.0); + for (size_t k = 0; k < num_threads; ++k) { + auto *tmp_m = matrices.data() + k * num_samples * num_samples; + for (size_t i = 0; i < num_samples; ++i) { + for (size_t j = i; j < num_samples; ++j) { + grm[i * num_samples + j] += tmp_m[i * num_samples + j]; + } + } + } + for (size_t i = 0; i < num_samples; ++i) { + for (size_t j = i; j < num_samples; ++j) { + grm[i * num_samples + j] /= num_snps; + grm[j * num_samples + i] = grm[i * num_samples + j]; + } + } +} + +// Calculates the diagonal of the GCTA GRM. +void CalcGCTADiagonal(const uint8_t *geno, const double *af, size_t num_samples, + size_t num_snps, double *diagonal, size_t num_threads) { + std::vector workers; + std::vector diagonals(num_samples * num_threads); + auto num_snps_job = num_snps / num_threads + 1; + auto num_snps_left = num_snps % num_threads; + auto num_full_bytes = num_samples / 4; + auto num_samples_left = num_samples % 4; + auto num_bytes = num_full_bytes + (num_samples_left > 0 ? 1 : 0); + for (size_t i = 0; i < num_snps_left; ++i) { + workers.emplace_back(CalcGCTADiagonalThread, geno, af, num_samples, num_snps_job, + diagonals.data() + i * num_samples); + geno += num_snps_job * num_bytes; + af += num_snps_job; + } + --num_snps_job; + for (size_t i = num_snps_left; i < num_threads; ++i) { + workers.emplace_back(CalcGCTADiagonalThread, geno, af, num_samples, num_snps_job, + diagonals.data() + i * num_samples); + geno += num_snps_job * num_bytes; + af += num_snps_job; + } + for (auto &&iter : workers) { + iter.join(); + } + std::fill(diagonal, diagonal + num_samples, 0.0); + for (size_t k = 0; k < num_threads; ++k) { + auto *tmp_d = diagonals.data() + k * num_samples; + for (size_t i = 0; i < num_samples; ++i) { + diagonal[i] += tmp_d[i]; + } + } + for (size_t i = 0; i < num_samples; ++i) { + diagonal[i] /= num_snps; + } +} + +} // namespace snplib diff --git a/src/grm.h b/src/grm.h new file mode 100644 index 0000000..d0ea8d7 --- /dev/null +++ b/src/grm.h @@ -0,0 +1,20 @@ +// grm.h +#ifndef SNPLIB_SRC_GRM_H_ +#define SNPLIB_SRC_GRM_H_ + +#include +#include + +namespace snplib { + +// Calculates the Genomic Relationship Matrix (GRM) from genotype data. +void CalcGRMMatrix(const uint8_t *geno, const double *af, size_t num_samples, + size_t num_snps, double *grm, size_t num_threads); + +// Calculates the diagonal of the GCTA GRM. +void CalcGCTADiagonal(const uint8_t *geno, const double *af, size_t num_samples, + size_t num_snps, double *diagonal, size_t num_threads); + +} // namespace snplib + +#endif // SNPLIB_SRC_GRM_H_ diff --git a/src/ibs.cc b/src/ibs.cc new file mode 100644 index 0000000..d816970 --- /dev/null +++ b/src/ibs.cc @@ -0,0 +1,245 @@ +// ibs.cc +#include "ibs.h" + +#include +#include +#include +#include + +#include "snp.h" + +namespace snplib { + +namespace { + +const uint64_t kIbsMask = 0x5555555555555555ull; // 0x5555=b0101010101010101 + +// Calculates the number of IBS for a block of 1024 SNPs between two samples. +uint64_t CalcIBSBlock(const __m128i *geno1, const __m128i *mask1, + const __m128i *geno2, const __m128i *mask2) { + uint64_t count = 0; + for (size_t i = 0; i < 16; ++i) { + __m128i g = _mm_xor_si128(_mm_loadu_si128(geno1 + i), _mm_loadu_si128(geno2 + i)); + __m128i m = _mm_and_si128(_mm_loadu_si128(mask1 + i), _mm_loadu_si128(mask2 + i)); + g = _mm_andnot_si128(g, m); + count += _mm_popcnt_u64(_mm_cvtsi128_si64(g)); + g = _mm_srli_si128(g, 8); + count += _mm_popcnt_u64(_mm_cvtsi128_si64(g)); + } + return count; +} + +// Creates a mask for the genotype data. +void MaskGeno(const uint64_t *geno64, size_t num_samples, uint64_t *mask64) { + for (size_t i = 0; i < 32 * num_samples; ++i) { + uint64_t tmp_geno = geno64[i] ^ kIbsMask; + tmp_geno = (tmp_geno | (tmp_geno >> 1)) & kIbsMask; + mask64[i] = tmp_geno * 3ull; + } +} + +// Updates the IBS matrix with the contribution from a block of SNPs. +void UpdateIBSMatrix(const uint64_t *geno64, const uint64_t *mask64, + size_t num_samples, uint64_t *matrix) { + for (size_t i = 0; i < num_samples; ++i) { + auto *g1 = reinterpret_cast(geno64 + 32 * i); + auto *m1 = reinterpret_cast(mask64 + 32 * i); + for (size_t j = i; j < num_samples; ++j) { + auto *g2 = reinterpret_cast(geno64 + 32 * j); + auto *m2 = reinterpret_cast(mask64 + 32 * j); + matrix[i * num_samples + j] += CalcIBSBlock(g1, m1, g2, m2); + } + } +} + +// Updates the IBS connection with the contribution from a block of SNPs. +void UpdateIBSConnection(const uint64_t *src_geno64, const uint64_t *src_mask64, + const uint64_t *dest_geno64, const uint64_t *dest_mask64, + size_t num_src_samples, size_t num_dest_samples, + uint64_t *connect) { + for (size_t i = 0; i < num_dest_samples; ++i) { + auto *g1 = reinterpret_cast(dest_geno64 + 32 * i); + auto *m1 = reinterpret_cast(dest_mask64 + 32 * i); + uint64_t result = 0; + for (size_t j = 0; j < num_src_samples; ++j) { + auto *g2 = reinterpret_cast(src_geno64 + 32 * j); + auto *m2 = reinterpret_cast(src_mask64 + 32 * j); + result += CalcIBSBlock(g1, m1, g2, m2); + } + connect[i] += result; + } +} + +// Thread function for calculating the IBS matrix. +void CalcIBSMatrixThread(const uint8_t *geno, size_t num_samples, + size_t num_snps, uint64_t *matrix) { + SNP snp(geno, num_samples); + auto num_blocks = num_snps / 1024u; + auto num_snps_left = num_snps % 1024u; + std::vector geno64(32 * num_samples); + std::vector mask64(32 * num_samples); + std::fill(matrix, matrix + num_samples * num_samples, 0ull); + for (size_t i = 0; i < num_blocks; ++i) { + for (size_t j = 0; j < 32; ++j) { + snp.TransposeGeno(32, j, geno64.data()); + snp += 32; + } + MaskGeno(geno64.data(), num_samples, mask64.data()); + UpdateIBSMatrix(geno64.data(), mask64.data(), num_samples, matrix); + } + if (num_snps_left > 0) { + num_blocks = num_snps_left / 32; + std::fill(geno64.begin(), geno64.end(), kIbsMask); + for (size_t j = 0; j < num_blocks; ++j) { + snp.TransposeGeno(32, j, geno64.data()); + snp += 32; + } + num_snps_left %= 32; + if (num_snps_left > 0u) { + snp.TransposeGeno(num_snps_left, num_blocks, geno64.data()); + } + MaskGeno(geno64.data(), num_samples, mask64.data()); + UpdateIBSMatrix(geno64.data(), mask64.data(), num_samples, matrix); + } +} + +// Thread function for calculating the IBS connection. +void CalcIBSConnectionThread(const uint8_t *src_geno, size_t num_src_samples, + const uint8_t *dest_geno, size_t num_dest_samples, + size_t num_snps, uint64_t *connection) { + SNP src_snp(src_geno, num_src_samples); + SNP dest_snp(dest_geno, num_dest_samples); + auto num_blocks = num_snps / 1024u; + auto num_snps_left = num_snps % 1024u; + std::vector src_geno64(32 * num_src_samples); + std::vector src_mask64(32 * num_src_samples); + std::vector dest_geno64(32 * num_dest_samples); + std::vector dest_mask64(32 * num_dest_samples); + std::fill(connection, connection + num_dest_samples, 0ull); + for (size_t i = 0; i < num_blocks; ++i) { + for (size_t j = 0; j < 32; ++j) { + src_snp.TransposeGeno(32, j, src_geno64.data()); + dest_snp.TransposeGeno(32, j, dest_geno64.data()); + src_snp += 32; + dest_snp += 32; + } + MaskGeno(src_geno64.data(), num_src_samples, src_mask64.data()); + MaskGeno(dest_geno64.data(), num_dest_samples, dest_mask64.data()); + UpdateIBSConnection(src_geno64.data(), src_mask64.data(), dest_geno64.data(), + dest_mask64.data(), num_src_samples, num_dest_samples, + connection); + } + if (num_snps_left > 0u) { + std::fill(src_geno64.begin(), src_geno64.end(), kIbsMask); + std::fill(dest_geno64.begin(), dest_geno64.end(), kIbsMask); + num_blocks = num_snps_left / 32; + for (size_t j = 0; j < num_blocks; ++j) { + src_snp.TransposeGeno(32, j, src_geno64.data()); + dest_snp.TransposeGeno(32, j, dest_geno64.data()); + src_snp += 32; + dest_snp += 32; + } + num_snps_left %= 32; + if (num_snps_left > 0u) { + src_snp.TransposeGeno(num_snps_left, num_blocks, src_geno64.data()); + dest_snp.TransposeGeno(num_snps_left, num_blocks, dest_geno64.data()); + } + MaskGeno(src_geno64.data(), num_src_samples, src_mask64.data()); + MaskGeno(dest_geno64.data(), num_dest_samples, dest_mask64.data()); + UpdateIBSConnection(src_geno64.data(), src_mask64.data(), dest_geno64.data(), + dest_mask64.data(), num_src_samples, num_dest_samples, + connection); + } +} + +} // namespace + +// Calculates the Identity by State (IBS) matrix from genotype data. +void CalcIBSMatrix(const uint8_t *geno, size_t num_samples, size_t num_snps, + double *matrix, size_t num_threads) { + std::vector workers; + std::vector matrices(num_samples * num_samples * num_threads); + auto num_snps_job = num_snps / num_threads + 1; + auto num_snps_left = num_snps % num_threads; + auto num_full_bytes = num_samples / 4; + auto num_samples_left = num_samples % 4; + auto num_bytes = num_full_bytes + (num_samples_left > 0 ? 1 : 0); + for (size_t i = 0; i < num_snps_left; ++i) { + workers.emplace_back(CalcIBSMatrixThread, geno, num_samples, num_snps_job, + matrices.data() + i * num_samples * num_samples); + geno += num_snps_job * num_bytes; + } + --num_snps_job; + for (size_t i = num_snps_left; i < num_threads; ++i) { + workers.emplace_back(CalcIBSMatrixThread, geno, num_samples, num_snps_job, + matrices.data() + i * num_samples * num_samples); + geno += num_snps_job * num_bytes; + } + for (auto &&iter : workers) { + iter.join(); + } + auto *matrix_u = matrices.data(); + for (size_t k = 1; k < num_threads; ++k) { + auto *tmp_m = matrices.data() + k * num_samples * num_samples; + for (size_t i = 0; i < num_samples; ++i) { + for (size_t j = i; j < num_samples; ++j) { + matrix_u[i * num_samples + j] += tmp_m[i * num_samples + j]; + } + } + } + for (size_t i = 0; i < num_samples; ++i) { + for (size_t j = i; j < num_samples; ++j) { + matrix[i * num_samples + j] = + static_cast(matrix_u[i * num_samples + j]) / num_snps / 2.0; + matrix[j * num_samples + i] = matrix[i * num_samples + j]; + } + } +} + +// Calculates the IBS connection between two sets of samples. +void CalcIBSConnection(const uint8_t *src_geno, size_t num_src_samples, + const uint8_t *dest_geno, size_t num_dest_samples, + size_t num_snps, double *connection, + size_t num_threads) { + std::vector workers(num_threads); + std::vector connects(num_dest_samples * num_threads); + auto num_src_full_bytes = num_src_samples / 4; + auto num_src_samples_left = num_src_samples % 4; + auto num_src_bytes = num_src_full_bytes + (num_src_samples_left > 0 ? 1 : 0); + auto num_dest_full_bytes = num_dest_samples / 4; + auto num_dest_samples_left = num_dest_samples % 4; + auto num_dest_bytes = + num_dest_full_bytes + (num_dest_samples_left > 0 ? 1 : 0); + auto num_snps_job = num_snps / num_threads + 1; + auto num_snps_left = num_snps % num_threads; + for (size_t i = 0; i < num_snps_left; ++i) { + workers[i] = std::thread(CalcIBSConnectionThread, src_geno, num_src_samples, + dest_geno, num_dest_samples, num_snps_job, + connects.data() + i * num_dest_samples); + src_geno += num_snps_job * num_src_bytes; + dest_geno += num_snps_job * num_dest_bytes; + } + --num_snps_job; + for (size_t i = num_snps_left; i < num_threads; ++i) { + workers[i] = std::thread(CalcIBSConnectionThread, src_geno, num_src_samples, + dest_geno, num_dest_samples, num_snps_job, + connects.data() + i * num_dest_samples); + src_geno += num_snps_job * num_src_bytes; + dest_geno += num_snps_job * num_dest_bytes; + } + for (auto &&iter : workers) { + iter.join(); + } + auto *connect_u = connects.data(); + for (size_t i = 1; i < num_threads; ++i) { + auto *tmp_c = connects.data() + i * num_dest_samples; + for (size_t j = 0; j < num_dest_samples; ++j) { + connect_u[j] += tmp_c[j]; + } + } + for (size_t i = 0; i < num_dest_samples; ++i) { + connection[i] = static_cast(connect_u[i]) / num_snps / 2.0; + } +} + +} // namespace snplib \ No newline at end of file diff --git a/src/ibs.h b/src/ibs.h new file mode 100644 index 0000000..558cc33 --- /dev/null +++ b/src/ibs.h @@ -0,0 +1,21 @@ +// ibs.h +#ifndef SNPLIB_SRC_IBS_H_ +#define SNPLIB_SRC_IBS_H_ + +#include +#include + +namespace snplib { + +// Calculates the Identity by State (IBS) matrix from genotype data. +void CalcIBSMatrix(const uint8_t *geno, size_t num_samples, size_t num_snps, + double *matrix, size_t num_threads); + +// Calculates the IBS connection between two sets of samples. +void CalcIBSConnection(const uint8_t *src_geno, size_t num_src_samples, + const uint8_t *dest_geno, size_t num_dest_samples, + size_t num_snps, double *connection, size_t num_threads); + +} // namespace snplib + +#endif // SNPLIB_SRC_IBS_H_ diff --git a/src/king.cc b/src/king.cc new file mode 100644 index 0000000..548d0e7 --- /dev/null +++ b/src/king.cc @@ -0,0 +1,205 @@ +// king.cc +#include "king.h" + +#include +#include +#include +#include +#include +#include + +#include "snp.h" + +namespace snplib { + +namespace { + +const uint64_t kHomozygousMask = 0x5555555555555555ull; // 0x5555=b0101010101010101 +const uint64_t kHeterozygousMask = 0xAAAAAAAAAAAAAAAAull; // 0xAAAA=b1010101010101010 + +// Calculates the KING-robust kinship coefficient for a block of 1024 SNPs. +int64_t CalcKINGBlock(const uint64_t *geno1, const uint64_t *geno2) { + int64_t num_hetero = 0; + int64_t num_homo = 0; + for (size_t i = 0; i < 32; ++i) { + auto mask_1 = geno1[i] ^ kHomozygousMask; + mask_1 = (mask_1 | (mask_1 >> 1)) & kHomozygousMask; + mask_1 *= 3ull; + auto mask_2 = geno2[i] ^ kHomozygousMask; + mask_2 = (mask_2 | (mask_2 >> 1)) & kHomozygousMask; + mask_2 *= 3ull; + auto hetero_1 = geno1[i] ^ kHeterozygousMask; + hetero_1 = (hetero_1 | (hetero_1 << 1)) >> 1; + hetero_1 = (hetero_1 & kHomozygousMask) * 3ull; + auto hetero_2 = geno2[i] ^ kHeterozygousMask; + hetero_2 = (hetero_2 | (hetero_2 << 1)) >> 1; + hetero_2 = (hetero_2 & kHomozygousMask) * 3ull; + num_hetero += _mm_popcnt_u64(~(hetero_1 | hetero_2)); + auto geno = geno1[i] ^ geno2[i]; + geno &= mask_1 & mask_2 & hetero_1 & hetero_2; + num_homo += _mm_popcnt_u64(geno); + } + return num_hetero - 2 * num_homo; +} + +// Calculates the number of heterozygous SNPs for a sample. +uint64_t CalcHeteroCount(const uint64_t *geno) { + uint64_t num_hetero = 0; + for (size_t i = 0; i < 32; ++i) { + auto mask = geno[i] ^ kHomozygousMask; + mask = (mask | (mask >> 1)) & kHomozygousMask; + mask *= 3ull; + auto hetero = geno[i] ^ kHeterozygousMask; + hetero = (hetero | (hetero << 1)) >> 1; + hetero = (hetero & mask) * 3ull; + num_hetero += _mm_popcnt_u64(~hetero); + } + return num_hetero; +} + +// Updates the KING matrix with the contribution from a block of SNPs. +void UpdateKINGMatrix(const uint64_t *geno64, size_t num_samples, int64_t *matrix) { + for (size_t i = 0; i < num_samples; ++i) { + auto *g1 = geno64 + 32 * i; + for (size_t j = i + 1; j < num_samples; ++j) { + auto *g2 = geno64 + 32 * j; + matrix[i * num_samples + j] += CalcKINGBlock(g1, g2); + } + } +} + +// Updates the heterozygous count vector with the contribution from a block of +// SNPs. +void UpdateHeteroCount(const uint64_t *geno64, size_t num_samples, + uint64_t *vector) { + for (size_t i = 0; i < num_samples; ++i) { + auto *g1 = geno64 + 32 * i; + vector[i] += CalcHeteroCount(g1); + } +} + +// Thread function for calculating the KING matrix. +void CalcKINGMatrixThread(const uint8_t *geno, size_t num_samples, size_t num_snps, + int64_t *matrix, uint64_t *vector) { + SNP snp(geno, num_samples); + auto num_blocks = num_snps / 1024u; + auto num_snps_left = num_snps % 1024u; + std::vector geno64(32 * num_samples); + std::fill(matrix, matrix + num_samples * num_samples, 0ull); + std::fill(vector, vector + num_samples, 0ull); + for (size_t i = 0; i < num_blocks; ++i) { + for (size_t j = 0; j < 32; ++j) { + snp.TransposeGeno(32, j, geno64.data()); + snp += 32; + } + UpdateHeteroCount(geno64.data(), num_samples, vector); + UpdateKINGMatrix(geno64.data(), num_samples, matrix); + } + if (num_snps_left > 0) { + num_blocks = num_snps_left / 32; + std::fill(geno64.begin(), geno64.end(), kHomozygousMask); + for (size_t j = 0; j < num_blocks; ++j) { + snp.TransposeGeno(32, j, geno64.data()); + snp += 32; + } + num_snps_left %= 32; + if (num_snps_left > 0u) { + snp.TransposeGeno(num_snps_left, num_blocks, geno64.data()); + } + UpdateHeteroCount(geno64.data(), num_samples, vector); + UpdateKINGMatrix(geno64.data(), num_samples, matrix); + } +} + +} // namespace + +// Calculates the KING-robust kinship coefficient matrix. +void CalcKINGMatrix(const uint8_t *geno, size_t num_samples, size_t num_snps, + double *matrix, size_t num_threads) { + std::vector workers; + std::vector matrices(num_samples * num_samples * num_threads); + std::vector vectors(num_samples * num_threads); + auto num_snps_job = num_snps / num_threads + 1; + auto num_snps_left = num_snps % num_threads; + auto num_full_bytes = num_samples / 4; + auto num_samples_left = num_samples % 4; + auto num_bytes = num_full_bytes + (num_samples_left > 0 ? 1 : 0); + for (size_t i = 0; i < num_snps_left; ++i) { + workers.emplace_back(CalcKINGMatrixThread, geno, num_samples, num_snps_job, + matrices.data() + i * num_samples * num_samples, + vectors.data() + i * num_samples); + geno += num_snps_job * num_bytes; + } + --num_snps_job; + for (size_t i = num_snps_left; i < num_threads; ++i) { + workers.emplace_back(CalcKINGMatrixThread, geno, num_samples, num_snps_job, + matrices.data() + i * num_samples * num_samples, + vectors.data() + i * num_samples); + geno += num_snps_job * num_bytes; + } + for (auto &&iter : workers) { + iter.join(); + } + auto *matrix_u = matrices.data(); + for (size_t k = 1; k < num_threads; ++k) { + auto *tmp_m = matrices.data() + k * num_samples * num_samples; + for (size_t i = 0; i < num_samples; ++i) { + for (size_t j = i; j < num_samples; ++j) { + matrix_u[i * num_samples + j] += tmp_m[i * num_samples + j]; + } + } + } + auto *vector_u = vectors.data(); + for (size_t k = 1; k < num_threads; ++k) { + auto *tmp_v = vectors.data() + k * num_samples; + for (size_t i = 0; i < num_samples; ++i) { + vector_u[i] += tmp_v[i]; + } + } + for (size_t i = 0; i < num_samples; ++i) { + for (size_t j = i + 1; j < num_samples; ++j) { + matrix[i * num_samples + j] = + static_cast(matrix_u[i * num_samples + j]) / + (vector_u[i] + vector_u[j]); + matrix[j * num_samples + i] = matrix[i * num_samples + j]; + } + } +} + +// Finds a group of unrelated individuals from a kinship matrix. +std::vector FindUnrelatedGroup(const double *matrix, + size_t num_samples, double threshold) { + std::vector R(num_samples * num_samples, 0); + for (size_t i = 0; i < num_samples; ++i) { + for (size_t j = i + 1; j < num_samples; ++j) { + R[i * num_samples + j] = matrix[i * num_samples + j] > threshold ? 1 : 0; + R[j * num_samples + i] = R[i * num_samples + j]; + } + } + std::vector r(num_samples); + for (size_t i = 0; i < num_samples; ++i) { + r[i] = std::accumulate(R.data() + i * num_samples, R.data() + (i + 1) * num_samples, 0); + } + std::vector I; + while (std::any_of(r.begin(), r.end(), [](int a) { return a > 0; })) { + auto idx = std::max_element(r.begin(), r.end()) - r.begin(); + for (size_t i = 0; i < num_samples; ++i) { + R[idx * num_samples + i] = 0; + R[i * num_samples + idx] = 0; + } + for (size_t i = 0; i < num_samples; ++i) { + r[i] = std::accumulate(R.data() + i * num_samples, R.data() + (i + 1) * num_samples, 0); + } + I.emplace_back(idx); + } + std::vector result; + for (int i = 0; i < num_samples; ++i) { + auto iter = std::find(I.begin(), I.end(), i); + if (iter == I.end()) { + result.emplace_back(i); + } + } + return result; +} + +} // namespace snplib \ No newline at end of file diff --git a/src/king.h b/src/king.h new file mode 100644 index 0000000..467f92a --- /dev/null +++ b/src/king.h @@ -0,0 +1,21 @@ +// king.h +#ifndef SNPLIB_SRC_KING_H_ +#define SNPLIB_SRC_KING_H_ + +#include +#include +#include + +namespace snplib { + +// Calculates the KING-robust kinship coefficient matrix. +void CalcKINGMatrix(const uint8_t *geno, size_t num_samples, size_t num_snps, + double *matrix, size_t num_threads); + +// Finds a group of unrelated individuals from a kinship matrix. +std::vector FindUnrelatedGroup(const double *matrix, + size_t num_samples, double threshold); + +} // namespace snplib + +#endif // SNPLIB_SRC_KING_H_ diff --git a/src/snp.cc b/src/snp.cc new file mode 100644 index 0000000..5d94fc3 --- /dev/null +++ b/src/snp.cc @@ -0,0 +1,134 @@ +// snp.cc +#include "snp.h" + +#include +#include + +namespace snplib { + +namespace { +// Constants for genotype data conversion. +const uint64_t kUgMask = 0x0303030303030303ull; // 0303 = 0000001100000011 +const uint64_t kSgMask = 0x0003000300030003ull; // 0003 = 0000000000000011 + +// Converts a 4x64-bit genotype block to a single 64-bit value. +uint64_t ConvertGenoTo64(const std::array &g64) { + uint64_t geno64; + geno64 = g64[3] & kUgMask; + geno64 <<= 2; + geno64 |= g64[2] & kUgMask; + geno64 <<= 2; + geno64 |= g64[1] & kUgMask; + geno64 <<= 2; + geno64 |= g64[0] & kUgMask; + return geno64; +} + +// Converts a 5x64-bit genotype block to a single 64-bit value. +uint64_t ConvertGenoTo64(const std::array &g64) { + uint64_t geno64; + geno64 = g64[4] & kSgMask; + geno64 <<= 3; + geno64 |= g64[3] & kSgMask; + geno64 <<= 3; + geno64 |= g64[2] & kSgMask; + geno64 <<= 3; + geno64 |= g64[1] & kSgMask; + geno64 <<= 3; + geno64 |= g64[0] & kSgMask; + return geno64; +} + +} // namespace + +SNP::SNP(const uint8_t *geno, size_t num_samples) + : geno_(geno), + num_samples_(num_samples), + num_full_bytes_(num_samples / 4), + num_samples_left_(num_samples_ % 4), + num_bytes_(num_full_bytes_ + (num_samples_left_ != 0 ? 1 : 0)) {} + +void SNP::ConvertGeno(size_t num_snps, size_t idx, + std::array &g64) { + uint64_t g8[32] = {0x55ull, 0x55ull, 0x55ull, 0x55ull, 0x55ull, 0x55ull, + 0x55ull, 0x55ull, 0x55ull, 0x55ull, 0x55ull, 0x55ull, + 0x55ull, 0x55ull, 0x55ull, 0x55ull, 0x55ull, 0x55ull, + 0x55ull, 0x55ull, 0x55ull, 0x55ull, 0x55ull, 0x55ull, + 0x55ull, 0x55ull, 0x55ull, 0x55ull, 0x55ull, 0x55ull, + 0x55ull, 0x55ull}; + for (size_t i = 0; i < num_snps; ++i) { + const auto *tmp_g = geno_ + i * num_bytes_; + g8[i] = static_cast(tmp_g[idx]); + } + for (size_t i = 0; i < 4; ++i) { + g64[i] = g8[i] + (g8[4 + i] << 8) + (g8[8 + i] << 16) + (g8[12 + i] << 24) + + (g8[16 + i] << 32) + (g8[20 + i] << 40) + (g8[24 + i] << 48) + + (g8[28 + i] << 56); + } +} + +void SNP::ConvertGeno(size_t num_snps, size_t idx, + std::array &g64) { + uint64_t g8[20] = {0x55ull, 0x55ull, 0x55ull, 0x55ull, 0x55ull, + 0x55ull, 0x55ull, 0x55ull, 0x55ull, 0x55ull, + 0x55ull, 0x55ull, 0x55ull, 0x55ull, 0x55ull, + 0x55ull, 0x55ull, 0x55ull, 0x55ull, 0x55ull}; + for (size_t i = 0; i < num_snps; ++i) { + const auto *tmp_g = geno_ + i * num_bytes_; + g8[i] = static_cast(tmp_g[idx]); + } + for (size_t i = 0; i < 5; ++i) { + g64[i] = + g8[i] + (g8[5 + i] << 16) + (g8[10 + i] << 32) + (g8[15 + i] << 48); + } +} + +void SNP::TransposeGeno(size_t num_snps, size_t idx, uint64_t *geno64) { + std::array g64; + for (size_t i = 0; i < num_full_bytes_; ++i) { + ConvertGeno(num_snps, i, g64); + geno64[32 * (4 * i) + idx] = ConvertGenoTo64(g64); + for (size_t j = 1; j < 4; ++j) { + for (auto &iter : g64) { + iter >>= 2; + } + geno64[32 * (4 * i + j) + idx] = ConvertGenoTo64(g64); + } + } + if (num_samples_left_ > 0) { + ConvertGeno(num_snps, num_full_bytes_, g64); + geno64[32 * (4 * num_full_bytes_) + idx] = ConvertGenoTo64(g64); + for (size_t j = 1; j < num_samples_left_; ++j) { + for (auto &iter : g64) { + iter >>= 2; + } + geno64[32 * (4 * num_full_bytes_ + j) + idx] = ConvertGenoTo64(g64); + } + } +} + +void SNP::TransposeGeno(size_t num_snps, uint64_t *geno64) { + std::array g64; + for (size_t i = 0; i < num_full_bytes_; ++i) { + ConvertGeno(num_snps, i, g64); + geno64[4 * i] = ConvertGenoTo64(g64); + for (size_t j = 1; j < 4; ++j) { + for (auto &iter : g64) { + iter >>= 2; + } + geno64[4 * i + j] = ConvertGenoTo64(g64); + } + } + if (num_samples_left_ > 0) { + ConvertGeno(num_snps, num_full_bytes_, g64); + geno64[4 * num_full_bytes_] = ConvertGenoTo64(g64); + for (size_t j = 1; j < num_samples_left_; ++j) { + for (auto &iter : g64) { + iter >>= 2; + } + geno64[4 * num_full_bytes_ + j] = ConvertGenoTo64(g64); + } + } +} + +} // namespace snplib \ No newline at end of file diff --git a/src/snp.h b/src/snp.h new file mode 100644 index 0000000..da4400e --- /dev/null +++ b/src/snp.h @@ -0,0 +1,93 @@ +// snp.h +#ifndef SNPLIB_SRC_SNP_H_ +#define SNPLIB_SRC_SNP_H_ + +#include +#include +#include + +namespace snplib { + +// The SNP class provides a view of a single SNP's genotype data. +// It is designed to work with packed genotype data where each sample's genotype +// is stored in 2 bits. The class provides methods for unpacking the genotype +// data, transposing it, and performing other operations. +class SNP { + public: + // Constructor for the SNP class. + // + // @param geno A pointer to the packed genotype data. + // @param num_samples The number of samples. + SNP(const uint8_t *geno, size_t num_samples); + + SNP(const SNP &) = default; + SNP(SNP &&) = default; + SNP &operator=(const SNP &) = delete; + SNP &operator=(SNP &&) = delete; + ~SNP() = default; + + // Returns the genotype value for a given sample index. + // + // @param idx The index of the sample. + // @return The genotype value (0, 1, 2, or 3). + uint8_t operator()(size_t idx) const; + + // Advances the SNP pointer to the next SNP. + // + // @param idx The number of SNPs to advance. + // @return A reference to the current SNP object. + SNP &operator+=(size_t idx); + + // Copies the SNP's genotype data to a destination buffer. + // + // @param dest A pointer to the destination buffer. + template + void Copy(T *dest) const { + memcpy(static_cast(dest), static_cast(geno_), + sizeof(uint8_t) * num_bytes_); + } + + // Unpacks the genotype data into a destination buffer. + // + // @param geno_table A lookup table for converting genotype values. + // @param geno A pointer to the destination buffer. + template + void UnpackGeno(const std::array &geno_table, T *geno); + + // Unpacks the genotype data into two destination buffers (genotype and mask). + // + // @param geno_table A lookup table for converting genotype values. + // @param mask_table A lookup table for creating the mask. + // @param geno A pointer to the genotype destination buffer. + // @param mask A pointer to the mask destination buffer. + template + void UnpackGeno(const std::array &geno_table, + const std::array &mask_table, T *geno, T *mask); + + // Transposes the genotype data for a block of SNPs. + // + // @param num_snps The number of SNPs to transpose. + // @param idx The index of the current SNP within the block. + // @param geno64 A pointer to the destination buffer. + void TransposeGeno(size_t num_snps, size_t idx, uint64_t *geno64); + + // Transposes the genotype data for a block of SNPs. + // + // @param num_snps The number of SNPs to transpose. + // @param geno64 A pointer to the destination buffer. + void TransposeGeno(size_t num_snps, uint64_t *geno64); + + private: + const uint8_t *geno_ = nullptr; + size_t num_samples_ = 0; + size_t num_full_bytes_ = 0; + size_t num_samples_left_ = 0; + size_t num_bytes_ = 0; + + void ConvertGeno(size_t num_snps, size_t idx, std::array &g64); + void ConvertGeno(size_t num_snps, size_t idx, std::array &g64); +}; + +} // namespace snplib + +#endif // SNPLIB_SRC_SNP_H_ diff --git a/src/statistics.cc b/src/statistics.cc new file mode 100644 index 0000000..542e3ab --- /dev/null +++ b/src/statistics.cc @@ -0,0 +1,188 @@ +// statistics.cc +#include "statistics.h" + +#include +#include +#include +#include +#include + +#include "snp.h" + +namespace snplib { + +namespace { + +std::atomic_size_t global_snpc_index; + +const uint64_t kHomozygousMask = 0x5555555555555555ull; // 0x5555=b0101010101010101 +const uint64_t kHeterozygousMask = 0xAAAAAAAAAAAAAAAAull; // 0xAAAA=b1010101010101010 + +// Calculates the LD R^2 between two SNPs. +double CalcLDR2(const uint64_t *geno_1, const uint64_t *geno_2, double af1, + double af2, size_t num_words) { + uint64_t num_samples = 0; + uint64_t n11 = 0; + uint64_t n12 = 0; + uint64_t n21 = 0; + uint64_t n22 = 0; + uint64_t n_homo_1 = 0; + uint64_t n_homo_2 = 0; + uint64_t n_hetero_1 = 0; + uint64_t n_hetero_2 = 0; + for (size_t i = 0; i < num_words; ++i) { + uint64_t mask_1 = geno_1[i] ^ kHomozygousMask; + mask_1 = (mask_1 | (mask_1 >> 1)) & kHomozygousMask; + uint64_t mask_2 = geno_2[i] ^ kHomozygousMask; + mask_2 = (mask_2 | (mask_2 >> 1)) & kHomozygousMask; + auto mask = mask_1 & mask_2; + mask *= 3ull; + uint64_t homo_1 = ((geno_1[i] ^ kHeterozygousMask) & kHomozygousMask) & mask; + uint64_t homo_2 = ((geno_2[i] ^ kHeterozygousMask) & kHomozygousMask) & mask; + uint64_t hetero_1 = geno_1[i] ^ kHomozygousMask; + hetero_1 = ((hetero_1 & (hetero_1 >> 1)) & kHomozygousMask) & mask; + uint64_t hetero_2 = geno_2[i] ^ kHomozygousMask; + hetero_2 = ((hetero_2 & (hetero_2 >> 1)) & kHomozygousMask) & mask; + num_samples += _mm_popcnt_u64(mask); + mask = homo_1 & homo_2; + n11 += _mm_popcnt_u64(mask); + mask = homo_1 & hetero_2; + n12 += _mm_popcnt_u64(mask); + mask = hetero_1 & homo_2; + n21 += _mm_popcnt_u64(mask); + mask = hetero_1 & hetero_2; + n22 += _mm_popcnt_u64(mask); + n_homo_1 += _mm_popcnt_u64(homo_1); + n_homo_2 += _mm_popcnt_u64(homo_2); + n_hetero_1 += _mm_popcnt_u64(hetero_1); + n_hetero_2 += _mm_popcnt_u64(hetero_2); + } + num_samples /= 2; + auto n13 = n_homo_1 - n11 - n12; + auto n23 = n_hetero_1 - n21 - n22; + auto n31 = n_homo_2 - n11 - n21; + auto n32 = n_hetero_2 - n12 - n22; + auto n33 = num_samples - n11 - n12 - n13 - n21 - n22 - n23 - n31 - n32; + auto r2 = static_cast(n11) * (2.0 - 2.0 * af1) * (2.0 - 2.0 * af2); + r2 += static_cast(n21) * (1.0 - 2.0 * af1) * (2.0 - 2.0 * af2); + r2 -= static_cast(n31) * 2.0 * af1 * (2.0 - 2.0 * af2); + r2 += static_cast(n12) * (2.0 - 2.0 * af1) * (1.0 - 2.0 * af2); + r2 += static_cast(n22) * (1.0 - 2.0 * af1) * (1.0 - 2.0 * af2); + r2 -= static_cast(n32) * 2.0 * af1 * (1.0 - 2.0 * af2); + r2 -= static_cast(n13) * (2.0 - 2.0 * af1) * 2.0 * af2; + r2 -= static_cast(n23) * (1.0 - 2.0 * af1) * 2.0 * af2; + r2 += static_cast(n33) * 2.0 * af1 * 2.0 * af2; + r2 /= 2.0 * std::sqrt(af1 * af2 * (1.0 - af1) * (1.0 - af2)); + r2 /= (num_samples - 1); + return r2 * r2; +} + +// Thread function for calculating LD scores. +void CalcLDScoresThread(const uint8_t *geno, const int32_t *bp, + const double *af, size_t num_snps, size_t num_samples, + size_t window_size, double r2_threshold, double *ldcv) { + auto local_ind = global_snpc_index++; + auto ws = static_cast(window_size / 2); + auto num_words = num_samples / 32 + (num_samples % 32 > 0 ? 1 : 0); + std::vector geno_1(num_words, kHomozygousMask); + std::vector geno_2(num_words, kHomozygousMask); + while (local_ind < num_snps) { + SNP snp1(geno, num_samples); + snp1 += local_ind; + snp1.Copy(geno_1.data()); + auto lower = bp[local_ind] - ws; + lower = lower > 0 ? lower : 0; + auto upper = bp[local_ind] + ws; + size_t first_ind = 0; + while (bp[first_ind] < lower) { + first_ind++; + } + size_t last_ind = first_ind; + while ((bp[last_ind] <= upper) && (last_ind < num_snps)) { + last_ind++; + } + double cv = 0.0; + for (size_t i = first_ind; i < last_ind; ++i) { + SNP snp2(geno, num_samples); + snp2 += i; + snp2.Copy(geno_2.data()); + auto r2 = CalcLDR2(geno_1.data(), geno_2.data(), af[local_ind], af[i], num_words); + cv += r2 > r2_threshold ? r2 : 0; + } + ldcv[local_ind] = cv; + local_ind = global_snpc_index++; + } +} + +} // namespace + +// Calculates the allele frequencies for a set of SNPs. +void CalcAlleleFrequencies(const uint8_t *geno, size_t num_samples, + size_t num_snps, double *af) { + auto num_words_snp = num_samples / 32 + ((num_samples % 32) != 0u ? 1 : 0); + auto num_bytes = num_samples / 4 + ((num_samples % 4) != 0 ? 1 : 0); + auto correct = 4 * num_bytes - num_samples; + correct *= 2; + std::vector geno64(num_words_snp, kHomozygousMask); + for (size_t i = 0; i < num_snps; ++i) { + memcpy(static_cast(geno64.data()), + static_cast(geno + i * num_bytes), + sizeof(uint8_t) * num_bytes); + uint64_t alleles = 0; + uint64_t nonmissings = 0; + for (size_t j = 0; j < num_words_snp; ++j) { + uint64_t tmp_g = geno64[j] ^ kHomozygousMask; + uint64_t tmp_m = (tmp_g | (tmp_g >> 1)) & kHomozygousMask; + tmp_m *= 3; + nonmissings += _mm_popcnt_u64(tmp_m); + tmp_g = tmp_m & geno64[j]; + alleles += _mm_popcnt_u64(tmp_g); + } + nonmissings -= correct; + af[i] = static_cast(alleles) / nonmissings; + } +} + +// Calculates the missing rate for a set of SNPs. +void CalcMissingRates(const uint8_t *geno, size_t num_samples, size_t num_snps, + double *missing) { + auto num_words_snp = num_samples / 32 + ((num_samples % 32) != 0u ? 1 : 0); + auto num_bytes = num_samples / 4 + ((num_samples % 4) != 0 ? 1 : 0); + auto correct = 4 * num_bytes - num_samples; + correct *= 2; + std::vector geno64(num_words_snp, kHomozygousMask); + for (size_t i = 0; i < num_snps; ++i) { + memcpy(static_cast(geno64.data()), + static_cast(geno + i * num_bytes), + sizeof(uint8_t) * num_bytes); + uint64_t alleles = 0; + uint64_t nonmissings = 0; + for (size_t j = 0; j < num_words_snp; ++j) { + uint64_t tmp_g = geno64[j] ^ kHomozygousMask; + uint64_t tmp_m = (tmp_g | (tmp_g >> 1)) & kHomozygousMask; + tmp_m *= 3; + nonmissings += _mm_popcnt_u64(tmp_m); + tmp_g = tmp_m & geno64[j]; + alleles += _mm_popcnt_u64(tmp_g); + } + nonmissings -= correct; + missing[i] = 1.0 - static_cast(nonmissings) / num_samples; + } +} + +// Calculates the LD scores for a set of SNPs. +void CalcLDScores(const uint8_t *geno, const int32_t *bp, const double *af, + size_t num_snps, size_t num_samples, size_t window_size, + double r2_threshold, double *ldcv, size_t num_threads) { + std::vector workers; + global_snpc_index = 0; + for (size_t i = 0; i < num_threads; ++i) { + workers.emplace_back(CalcLDScoresThread, geno, bp, af, num_snps, + num_samples, window_size, r2_threshold, ldcv); + } + for (auto &&iter : workers) { + iter.join(); + } +} + +} // namespace snplib \ No newline at end of file diff --git a/src/statistics.h b/src/statistics.h new file mode 100644 index 0000000..b2d9d2c --- /dev/null +++ b/src/statistics.h @@ -0,0 +1,25 @@ +// statistics.h +#ifndef SNPLIB_SRC_STATISTICS_H_ +#define SNPLIB_SRC_STATISTICS_H_ + +#include +#include + +namespace snplib { + +// Calculates the allele frequencies for a set of SNPs. +void CalcAlleleFrequencies(const uint8_t *geno, size_t num_samples, + size_t num_snps, double *af); + +// Calculates the missing rate for a set of SNPs. +void CalcMissingRates(const uint8_t *geno, size_t num_samples, size_t num_snps, + double *missing); + +// Calculates the LD scores for a set of SNPs. +void CalcLDScores(const uint8_t *geno, const int32_t *bp, const double *af, + size_t num_snps, size_t num_samples, size_t window_size, + double r2_threshold, double *ldcv, size_t num_threads); + +} // namespace snplib + +#endif // SNPLIB_SRC_STATISTICS_H_ \ No newline at end of file diff --git a/src/ugrm.cc b/src/ugrm.cc new file mode 100644 index 0000000..9c565b3 --- /dev/null +++ b/src/ugrm.cc @@ -0,0 +1,142 @@ +// ugrm.cc +#include "ugrm.h" + +#include +#include +#include +#include + +#include "snp.h" + +namespace snplib { + +namespace { + +const uint64_t kHomozygousMask = 0x5555555555555555ull; // 0x5555=b0101010101010101 +const uint64_t kHeterozygousMask = 0xAAAAAAAAAAAAAAAAull; // 0xAAAA=b1010101010101010 + +// Calculates the U-GRM for a block of 1024 SNPs between two samples. +int64_t CalcUGRMBlock(const __m128i *geno1, const __m128i *mask1, + const __m128i *geno2, const __m128i *mask2) noexcept { + int64_t count_same = 0; + int64_t count_diff = 0; + for (size_t i = 0; i < 16; ++i) { + __m128i g = _mm_xor_si128(_mm_loadu_si128(geno1 + i), _mm_loadu_si128(geno2 + i)); + __m128i m = _mm_and_si128(_mm_loadu_si128(mask1 + i), _mm_loadu_si128(mask2 + i)); + __m128i g_same = _mm_andnot_si128(g, m); + __m128i g_diff = _mm_and_si128(g, m); + count_same += _mm_popcnt_u64(_mm_cvtsi128_si64(g_same)); + count_diff += _mm_popcnt_u64(_mm_cvtsi128_si64(g_diff)); + g_same = _mm_srli_si128(g_same, 8); + g_diff = _mm_srli_si128(g_diff, 8); + count_same += _mm_popcnt_u64(_mm_cvtsi128_si64(g_same)); + count_diff += _mm_popcnt_u64(_mm_cvtsi128_si64(g_diff)); + } + return count_same - count_diff; +} + +// Creates a mask for the genotype data. +void MaskGeno(const uint64_t *geno64, size_t num_samples, uint64_t *mask64) { + for (size_t i = 0; i < 32 * num_samples; ++i) { + uint64_t tmp_geno = geno64[i] ^ kHomozygousMask; + tmp_geno = (tmp_geno | (tmp_geno >> 1)) & kHomozygousMask; + auto mask_1 = tmp_geno * 3ull; + tmp_geno = geno64[i] ^ kHeterozygousMask; + tmp_geno = (tmp_geno | (tmp_geno << 1)) >> 1; + auto mask_2 = (tmp_geno & kHomozygousMask) * 3ull; + mask64[i] = mask_1 & mask_2; + } +} + +// Updates the U-GRM matrix with the contribution from a block of SNPs. +void UpdateUGRMMatrix(const uint64_t *geno64, const uint64_t *mask64, + size_t num_samples, int64_t *matrix) { + for (size_t i = 0; i < num_samples; ++i) { + auto *g1 = reinterpret_cast(geno64 + 32 * i); + auto *m1 = reinterpret_cast(mask64 + 32 * i); + for (size_t j = i; j < num_samples; ++j) { + auto *g2 = reinterpret_cast(geno64 + 32 * j); + auto *m2 = reinterpret_cast(mask64 + 32 * j); + matrix[i * num_samples + j] += CalcUGRMBlock(g1, m1, g2, m2); + } + } +} + +// Thread function for calculating the U-GRM matrix. +void CalcUGRMMatrixThread(const uint8_t *geno, size_t num_samples, + size_t num_snps, int64_t *matrix) { + SNP snp(geno, num_samples); + auto num_blocks = num_snps / 1024u; + auto num_snps_left = num_snps % 1024u; + std::vector geno64(32 * num_samples); + std::vector mask64(32 * num_samples); + std::fill(matrix, matrix + num_samples * num_samples, 0ull); + for (size_t i = 0; i < num_blocks; ++i) { + for (size_t j = 0; j < 32; ++j) { + snp.TransposeGeno(32, j, geno64.data()); + snp += 32; + } + MaskGeno(geno64.data(), num_samples, mask64.data()); + UpdateUGRMMatrix(geno64.data(), mask64.data(), num_samples, matrix); + } + if (num_snps_left > 0) { + num_blocks = num_snps_left / 32; + std::fill(geno64.begin(), geno64.end(), kHomozygousMask); + for (size_t j = 0; j < num_blocks; ++j) { + snp.TransposeGeno(32, j, geno64.data()); + snp += 32; + } + num_snps_left %= 32; + if (num_snps_left > 0u) { + snp.TransposeGeno(num_snps_left, num_blocks, geno64.data()); + } + MaskGeno(geno64.data(), num_samples, mask64.data()); + UpdateUGRMMatrix(geno64.data(), mask64.data(), num_samples, matrix); + } +} + +} // namespace + +// Calculates the U-GRM (Genomic Relationship Matrix) from genotype data. +void CalcUGRMMatrix(const uint8_t *geno, size_t num_samples, size_t num_snps, + double *matrix, size_t num_threads) { + std::vector workers; + std::vector matrices(num_samples * num_samples * num_threads); + auto num_snps_job = num_snps / num_threads + 1; + auto num_snps_left = num_snps % num_threads; + auto num_full_bytes = num_samples / 4; + auto num_samples_left = num_samples % 4; + auto num_bytes = num_full_bytes + (num_samples_left > 0 ? 1 : 0); + for (size_t i = 0; i < num_snps_left; ++i) { + workers.emplace_back(CalcUGRMMatrixThread, geno, num_samples, num_snps_job, + matrices.data() + i * num_samples * num_samples); + geno += num_snps_job * num_bytes; + } + --num_snps_job; + for (size_t i = num_snps_left; i < num_threads; ++i) { + workers.emplace_back(CalcUGRMMatrixThread, geno, num_samples, num_snps_job, + matrices.data() + i * num_samples * num_samples); + geno += num_snps_job * num_bytes; + } + for (auto &&iter : workers) { + iter.join(); + } + auto *matrix_u = matrices.data(); + for (size_t k = 1; k < num_threads; ++k) { + auto *tmp_m = matrices.data() + k * num_samples * num_samples; + for (size_t i = 0; i < num_samples; ++i) { + for (size_t j = i; j < num_samples; ++j) { + matrix_u[i * num_samples + j] += tmp_m[i * num_samples + j]; + } + } + } + for (size_t i = 0; i < num_samples; ++i) { + for (size_t j = i; j < num_samples; ++j) { + matrix[i * num_samples + j] = + static_cast(matrix_u[i * num_samples + j]) / num_snps / 2.0; + matrix[j * num_samples + i] = matrix[i * num_samples + j]; + } + } +} + +} // namespace snplib \ No newline at end of file diff --git a/src/ugrm.h b/src/ugrm.h new file mode 100644 index 0000000..4ff43be --- /dev/null +++ b/src/ugrm.h @@ -0,0 +1,16 @@ +// ugrm.h +#ifndef SNPLIB_SRC_UGRM_H_ +#define SNPLIB_SRC_UGRM_H_ + +#include +#include + +namespace snplib { + +// Calculates the U-GRM (Genomic Relationship Matrix) from genotype data. +void CalcUGRMMatrix(const uint8_t *geno, size_t num_samples, size_t num_snps, + double *matrix, size_t num_threads); + +} // namespace snplib + +#endif // SNPLIB_SRC_UGRM_H_