Fix data types when declaring / passing data to Cython.

shz9 · Apr 4, 2024 · 1a964f9
1 parent 54755ff
commit 1a964f9
Show file tree

Hide file tree

Showing 5 changed files with 22 additions and 15 deletions.
diff --git a/magenpy/GenotypeMatrix.py b/magenpy/GenotypeMatrix.py
@@ -633,8 +633,8 @@ def from_file(cls, file_path, temp_dir='temp', **kwargs):
         snp_table = snp_table.astype({
             'CHR': int,
             'SNP': str,
-            'cM': float,
-            'POS': int,
+            'cM': np.float32,
+            'POS': np.int32,
             'A1': str,
             'A2': str
         })

diff --git a/magenpy/parsers/misc_parsers.py b/magenpy/parsers/misc_parsers.py
@@ -57,7 +57,7 @@ def parse_ld_block_data(ldb_file_path):
     df = pd.read_csv(ldb_file_path, sep=r'\s+')
 
     df = df.loc[(df.start != 'None') & (df.stop != 'None')]
-    df = df.astype({'chr': str, 'start': np.int64, 'stop': np.int64})
+    df = df.astype({'chr': str, 'start': np.int32, 'stop': np.int32})
     df = df.sort_values('start')
 
     if df.isnull().values.any():

diff --git a/magenpy/parsers/plink_parsers.py b/magenpy/parsers/plink_parsers.py
@@ -1,4 +1,5 @@
 import pandas as pd
+import numpy as np
 
 
 def parse_bim_file(plink_bfile):
@@ -31,8 +32,8 @@ def parse_bim_file(plink_bfile):
                          dtype={
                              'CHR': int,
                              'SNP': str,
-                             'cM': float,
-                             'POS': int,
+                             'cM': np.float32,
+                             'POS': np.int32,
                              'A1': str,
                              'A2': str
                          })
@@ -72,8 +73,8 @@ def parse_fam_file(plink_bfile):
                                 'IID': str,
                                 'fatherID': str,
                                 'motherID': str,
-                                'sex': float,
-                                'phenotype': float
+                                'sex': np.float32,
+                                'phenotype': np.float32
                                 },
                          na_values={
                              'phenotype': [-9.],

diff --git a/magenpy/parsers/sumstats_parsers.py b/magenpy/parsers/sumstats_parsers.py
@@ -68,6 +68,11 @@ def parse(self, file_name, drop_na=True):
         if self.col_name_converter is not None:
             df.rename(columns=self.col_name_converter, inplace=True)
 
+        try:
+            df['POS'] = df['POS'].astype(np.int32)
+        except KeyError:
+            pass
+
         return df
 
 

diff --git a/magenpy/stats/ld/c_utils.pyx b/magenpy/stats/ld/c_utils.pyx
@@ -9,7 +9,8 @@
 # cython: infer_types=True
 
 from libc.math cimport exp
-from cython cimport integral
+from libc.stdint cimport int64_t
+from cython cimport integral, floating
 cimport cython
 import numpy as np
 
@@ -35,8 +36,8 @@ cpdef filter_ut_csr_matrix_low_memory(integral[::1] indptr, char[::1] bool_mask)
 
 
     cdef:
-        long i, curr_row, row_bound, new_indptr_idx = 1, curr_shape=indptr.shape[0] - 1
-        long[::1] new_indptr = np.zeros(np.count_nonzero(bool_mask) + 1, dtype=np.int64)
+        int64_t i, curr_row, row_bound, new_indptr_idx = 1, curr_shape=indptr.shape[0] - 1
+        int64_t[::1] new_indptr = np.zeros(np.count_nonzero(bool_mask) + 1, dtype=np.int64)
         char[::1] data_mask = np.zeros(indptr[curr_shape], dtype=np.int8)
 
     with nogil:
@@ -70,7 +71,7 @@ cpdef filter_ut_csr_matrix_low_memory(integral[::1] indptr, char[::1] bool_mask)
 @cython.wraparound(False)
 @cython.nonecheck(False)
 @cython.exceptval(check=False)
-cpdef expand_ranges(integral[::1] start, integral[::1] end, long output_size):
+cpdef expand_ranges(integral[::1] start, integral[::1] end, int64_t output_size):
     """
     Given a set of start and end indices, expand them into one long vector that contains 
     the indices between all start and end positions.
@@ -83,7 +84,7 @@ cpdef expand_ranges(integral[::1] start, integral[::1] end, long output_size):
 
     cdef:
         integral i, j, size=start.shape[0]
-        long out_idx = 0
+        int64_t out_idx = 0
         integral[::1] output
 
     if integral is int:
@@ -104,7 +105,7 @@ cpdef expand_ranges(integral[::1] start, integral[::1] end, long output_size):
 @cython.nonecheck(False)
 @cython.cdivision(True)
 @cython.exceptval(check=False)
-cpdef find_ld_block_boundaries(integral[:] pos, integral[:, :] block_boundaries):
+cpdef find_ld_block_boundaries(integral[:] pos, int[:, :] block_boundaries):
     """
     Find the LD boundaries for the blockwise estimator of LD, i.e., the 
     indices of the leftmost and rightmost neighbors for each SNP.
@@ -146,7 +147,7 @@ cpdef find_ld_block_boundaries(integral[:] pos, integral[:, :] block_boundaries)
 @cython.nonecheck(False)
 @cython.cdivision(True)
 @cython.exceptval(check=False)
-cpdef find_windowed_ld_boundaries(double[:] pos, double max_dist):
+cpdef find_windowed_ld_boundaries(floating[:] pos, double max_dist):
     """
     Find the LD boundaries for the windowed estimator of LD, i.e., the 
     indices of the leftmost and rightmost neighbors for each SNP.
@@ -180,7 +181,7 @@ cpdef find_windowed_ld_boundaries(double[:] pos, double max_dist):
 @cython.nonecheck(False)
 @cython.cdivision(True)
 @cython.exceptval(check=False)
-cpdef find_shrinkage_ld_boundaries(double[:] cm_pos,
+cpdef find_shrinkage_ld_boundaries(floating[:] cm_pos,
                                   double genmap_ne,
                                   int genmap_sample_size,
                                   double cutoff):