Skip to content

Commit 3563181

Browse files
committed
Update to the new version of bgen-reader and fix the code so that all tests pass.
1 parent e83a60e commit 3563181

File tree

9 files changed

+78
-65
lines changed

9 files changed

+78
-65
lines changed

pysnptools/distreader/bgen.py

+27-17
Original file line numberDiff line numberDiff line change
@@ -145,7 +145,16 @@ def _apply_iid_function(self,samples):
145145
if self._iid_function is not default_iid_function:
146146
return np.array([self._iid_function(sample) for sample in samples],dtype='str')
147147
else:
148-
samples_np = np.stack(np.core.defchararray.split(samples,',',maxsplit=2))
148+
try:
149+
samples_np = np.stack(np.core.defchararray.split(samples,',',maxsplit=2))
150+
except:
151+
def split_and_fill(s):
152+
fields = s.split(',',2)
153+
if len(fields)==1:
154+
return ['0',fields[0]]
155+
else:
156+
return fields
157+
samples_np = np.array([split_and_fill(sample) for sample in samples])
149158
if samples_np.shape[1]==1:
150159
samples_np = np.stack([np.full(samples.shape,'0'),samples_np.reshape(-1)],axis=1)
151160
row = samples_np
@@ -173,14 +182,14 @@ def _run_once(self):
173182

174183
assert os.path.exists(self.filename), "Expect file to exist ('{0}')".format(self.filename)
175184
#!!!cmkassert os.path.getsize(self.filename)<2**31, "For now, Python cannot access files larger than about 2G bytes (see https://github.com/limix/bgen-reader-py/issues/29)"
176-
verbose = logging.getLogger().level >= logging.INFO
185+
verbose = logging.getLogger().level <= logging.INFO
177186

178-
self._bgen = open_bgen(self.filename,self._sample,verbose)
179-
self._row = self._apply_iid_function(self._bgen.samples)
180-
self._col = self._apply_sid_function(self._bgen.ids,self._bgen.rsids)
187+
self._open_bgen = open_bgen(self.filename,self._sample,verbose)
188+
self._row = self._apply_iid_function(self._open_bgen.samples)
189+
self._col = self._apply_sid_function(self._open_bgen.ids,self._open_bgen.rsids)
181190
self._col_property = np.zeros((len(self._col),3),dtype='float')
182-
self._col_property[:,0] = self._bgen.chromosomes
183-
self._col_property[:,2] = self._bgen.positions
191+
self._col_property[:,0] = self._open_bgen.chromosomes
192+
self._col_property[:,2] = self._open_bgen.positions
184193

185194
self._assert_iid_sid_pos(check_val=False)
186195

@@ -191,8 +200,8 @@ def _read(self, iid_index_or_none, sid_index_or_none, order, dtype, force_python
191200
if order=='A':
192201
order='F'
193202

194-
#cmk assert self._bgen.nalleles unique is 2, phased is all false, ploidy is 2
195-
val = self._bgen.read((iid_index_or_none,sid_index_or_none),dtype=dtype,order=order)
203+
#cmk assert self._open_bgen.nalleles unique is 2, phased is all false, ploidy is 2
204+
val = self._open_bgen.read((iid_index_or_none,sid_index_or_none),dtype=dtype,order=order)
196205
return val
197206

198207
def __repr__(self):
@@ -214,8 +223,8 @@ def flush(self):
214223
'''
215224
if hasattr(self,'_ran_once') and self._ran_once:
216225
self._ran_once = False
217-
if hasattr(self,'_bgen') and self._bgen is not None: # we need to test this because Python doesn't guarantee that __init__ was fully run
218-
del self._bgen
226+
if hasattr(self,'_bgen') and self._open_bgen is not None: # we need to test this because Python doesn't guarantee that __init__ was fully run
227+
del self._open_bgen
219228

220229
@staticmethod
221230
def write(filename, distreader, bits=16, compression=None, sample_function=default_sample_function, id_rsid_function=default_id_rsid_function, iid_function=default_iid_function, sid_function=default_sid_function, block_size=None, qctool_path=None, cleanup_temp_files=True):
@@ -272,7 +281,7 @@ def write(filename, distreader, bits=16, compression=None, sample_function=defau
272281
dir, file = os.path.split(filename)
273282
if dir=='':
274283
dir='.'
275-
metadatanpz = file+'.metadata.npz'
284+
metadatanpz = open_bgen._metadatapath_from_filename(file)
276285
samplefile = os.path.splitext(file)[0]+'.sample'
277286
genfile = os.path.splitext(file)[0]+'.gen'
278287
olddir = os.getcwd()
@@ -374,7 +383,7 @@ def copyinputs(self, copier):
374383
copier.input(self.filename)
375384
if self._sample is not None:
376385
copier.input(self._sample)
377-
metadata2 = self.filename + ".metadata.npz"
386+
metadata2 = open_bgen._metadatapath_from_filename(self.filename)
378387
if os.path.exists(metadata2):
379388
copier.input(metadata2)
380389

@@ -520,8 +529,9 @@ def test_read1(self):
520529
pstutil.create_directory_if_necessary(file_to)
521530
if os.path.exists(file_to+".metadata"):
522531
os.remove(file_to+".metadata")
523-
if os.path.exists(file_to+".metadata.npz"):
524-
os.remove(file_to+".metadata.npz")
532+
meta = open_bgen._metadatapath_from_filename(file_to)
533+
if os.path.exists(meta):
534+
os.remove(meta)
525535
shutil.copy(file_from,file_to)
526536

527537
for loop_index in range(2):
@@ -542,12 +552,12 @@ def test_read1(self):
542552
bgen = Bgen(file_to,iid_function,sid_function=sid_function)
543553
assert bgen.sid[0]=='SNPID_2,RSID_2'
544554

545-
os.remove(file_to+".metadata.npz")
555+
os.remove(bgen._open_bgen._metadatapath_from_filename(file_to))
546556
sid_function = lambda id,rsid: '{0},{1}'.format(id,rsid)
547557
bgen = Bgen(file_to,iid_function,sid_function=sid_function)
548558
assert bgen.sid[0]=='SNPID_2,RSID_2'
549559

550-
os.remove(file_to+".metadata.npz")
560+
os.remove(bgen._open_bgen._metadatapath_from_filename(file_to))
551561
bgen = Bgen(file_to,iid_function,sid_function='rsid')
552562
assert np.array_equal(bgen.iid[0],['sample_001', 'sample_001'])
553563
assert bgen.sid[0]=='RSID_2'

pysnptools/distreader/test.py

+9-9
Original file line numberDiff line numberDiff line change
@@ -165,13 +165,13 @@ def test_subset_view(self):
165165
result = distreader2.read(view_ok=True)
166166
self.assertFalse(distreader2 is result)
167167
result2 = result[:,:].read()
168-
self.assertFalse(sp.may_share_memory(result2.val,result.val))
168+
self.assertFalse(np.may_share_memory(result2.val,result.val))
169169
result3 = result[:,:].read(view_ok=True)
170-
self.assertTrue(sp.may_share_memory(result3.val,result.val))
170+
self.assertTrue(np.may_share_memory(result3.val,result.val))
171171
result4 = result3.read()
172-
self.assertFalse(sp.may_share_memory(result4.val,result3.val))
172+
self.assertFalse(np.may_share_memory(result4.val,result3.val))
173173
result5 = result4.read(view_ok=True)
174-
self.assertTrue(sp.may_share_memory(result4.val,result5.val))
174+
self.assertTrue(np.may_share_memory(result4.val,result5.val))
175175

176176

177177

@@ -231,7 +231,7 @@ def test_writes(self):
231231
shutil.rmtree(filename)
232232
ret = writer(filename,distdata)
233233
assert ret is not None
234-
for subsetter in [None, sp.s_[::2,::3]]:
234+
for subsetter in [None, np.s_[::2,::3]]:
235235
reader = constructor(filename)
236236
_fortesting_JustCheckExists().input(reader)
237237
subreader = reader if subsetter is None else reader[subsetter[0],subsetter[1]]
@@ -479,11 +479,11 @@ def factory_iterator():
479479

480480
for iid_index_list in [range(N_original), range(N_original//2), range(N_original - 1,0,-2)]:
481481
for snp_index_list in [range(snps_to_read_count), range(snps_to_read_count//2), range(snps_to_read_count - 1,0,-2)]:
482-
reference_snps, reference_dtype = TestDistNaNCNC(iid_index_list, snp_index_list, snp_reader_factory_distnpz(), sp.float64, "C", "False", None, None).read_and_standardize()
482+
reference_snps, reference_dtype = TestDistNaNCNC(iid_index_list, snp_index_list, snp_reader_factory_distnpz(), np.float64, "C", "False", None, None).read_and_standardize()
483483
for distreader_factory in [snp_reader_factory_distnpz,
484484
snp_reader_factory_snpmajor_hdf5, snp_reader_factory_iidmajor_hdf5
485485
]:
486-
for dtype in [sp.float64,sp.float32]:
486+
for dtype in [np.float64,np.float32]:
487487
for order in ["C", "F"]:
488488
for force_python_only in [False, True]:
489489
distreader = distreader_factory()
@@ -529,7 +529,7 @@ def runTest(self, result = None):
529529
assert not np.array_equal(snps[0,0],snps[0,0]) #without SnpReader's standardization NaN's stay NaN's
530530
assert np.allclose(snps[0,1],[.1,.2,.7])
531531
if self.reference_snps is not None:
532-
self.assertTrue(np.allclose(self.reference_snps, snps, rtol=1e-04 if dtype == sp.float32 or self.reference_dtype == sp.float32 else 1e-12,equal_nan=True))
532+
self.assertTrue(np.allclose(self.reference_snps, snps, rtol=1e-04 if dtype == np.float32 or self.reference_dtype == np.float32 else 1e-12,equal_nan=True))
533533

534534

535535
# We do it this way instead of using doctest.DocTestSuite because doctest.DocTestSuite requires modules to be pickled, which python doesn't allow.
@@ -604,7 +604,7 @@ def getTestSuite():
604604
return test_suite
605605

606606
if __name__ == '__main__':
607-
logging.basicConfig(level=logging.INFO)
607+
logging.basicConfig(level=logging.WARN)
608608

609609
if False:
610610
from pysnptools.snpreader import Bed

pysnptools/pstreader/test.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -105,7 +105,7 @@ def _zero(c):
105105
the_class.write(filename,pstdata)
106106
reader = the_class(filename) if suffix!='hdf5' else the_class(filename,block_size=3)
107107
_fortesting_JustCheckExists().input(reader)
108-
for subsetter in [None, sp.s_[::2,::3]]:
108+
for subsetter in [None, np.s_[::2,::3]]:
109109
subreader = reader if subsetter is None else reader[subsetter[0],subsetter[1]]
110110
expected = pstdata if subsetter is None else pstdata[subsetter[0],subsetter[1]].read()
111111
for order in ['C','F','A']:

pysnptools/pysnptools.pyproj

+1-1
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@
55
<SchemaVersion>2.0</SchemaVersion>
66
<ProjectGuid>{60f1cb4a-9da3-47c2-9b89-60ac1ce93347}</ProjectGuid>
77
<ProjectHome />
8-
<StartupFile>distreader\bgen.py</StartupFile>
8+
<StartupFile>distreader\test.py</StartupFile>
99
<SearchPath>..\;..\..\bgen-reader-py</SearchPath>
1010
<WorkingDirectory>.\snpreader</WorkingDirectory>
1111
<OutputPath>.</OutputPath>

pysnptools/test.py

+11-11
Original file line numberDiff line numberDiff line change
@@ -376,7 +376,7 @@ def test_some_std(self):
376376

377377
from pysnptools.standardizer.identity import Identity
378378
from pysnptools.standardizer.diag_K_to_N import DiagKtoN
379-
for dtype in [sp.float64,sp.float32]:
379+
for dtype in [np.float64,np.float32]:
380380
for std in [Unit(),Beta(1,25),Identity(),DiagKtoN()]:
381381
s = str(std)
382382
np.random.seed(0)
@@ -418,7 +418,7 @@ def standardize(self,snpreader):
418418
make sure blocked standardize yields same result as regular standardize
419419
"""
420420

421-
for dtype in [sp.float64,sp.float32]:
421+
for dtype in [np.float64,np.float32]:
422422

423423
snps = snpreader.read(order='F',force_python_only=True,dtype=dtype).val
424424
self.assertEqual(dtype, snps.dtype)
@@ -562,13 +562,13 @@ def test_subset_view(self):
562562
result = snpreader2.read(view_ok=True)
563563
self.assertFalse(snpreader2 is result)
564564
result2 = result[:,:].read()
565-
self.assertFalse(sp.may_share_memory(result2.val,result.val))
565+
self.assertFalse(np.may_share_memory(result2.val,result.val))
566566
result3 = result[:,:].read(view_ok=True)
567-
self.assertTrue(sp.may_share_memory(result3.val,result.val))
567+
self.assertTrue(np.may_share_memory(result3.val,result.val))
568568
result4 = result3.read()
569-
self.assertFalse(sp.may_share_memory(result4.val,result3.val))
569+
self.assertFalse(np.may_share_memory(result4.val,result3.val))
570570
result5 = result4.read(view_ok=True)
571-
self.assertTrue(sp.may_share_memory(result4.val,result5.val))
571+
self.assertTrue(np.may_share_memory(result4.val,result5.val))
572572

573573
def test_load_and_standardize_hdf5(self):
574574
snpreader2 = SnpHdf5(self.currentFolder + "/examples/toydata.snpmajor.snp.hdf5")
@@ -610,7 +610,7 @@ def load_and_standardize(self, snpreader2, snpreader3):
610610
iid_index_list = range(N_original - 1,0,-2)
611611
snpreader3 = snpreader3[iid_index_list,:]
612612

613-
for dtype in [sp.float64,sp.float32]:
613+
for dtype in [np.float64,np.float32]:
614614

615615
G2 = snpreader2.read(order='F',force_python_only=True).val
616616
G2 = Unit().standardize(G2, block_size=10000, force_python_only=True)
@@ -808,7 +808,7 @@ def test_writes(self):
808808
shutil.rmtree(filename)
809809
ret = writer(filename,snpdata)
810810
assert ret is not None
811-
for subsetter in [None, sp.s_[::2,::3]]:
811+
for subsetter in [None, np.s_[::2,::3]]:
812812
reader = constructor(filename)
813813
_fortesting_JustCheckExists().input(reader)
814814
subreader = reader if subsetter is None else reader[subsetter[0],subsetter[1]]
@@ -875,12 +875,12 @@ def factory_iterator():
875875
for iid_index_list in [range(N_original), range(N_original//2), range(N_original - 1,0,-2)]:
876876
for snp_index_list in [range(snps_to_read_count), range(snps_to_read_count//2), range(snps_to_read_count - 1,0,-2)]:
877877
for standardizer in [Unit(),Beta(1,25)]:
878-
reference_snps, reference_dtype = NaNCNCTestCases(iid_index_list, snp_index_list, standardizer, snp_reader_factory_bed(), sp.float64, "C", "False", None, None).read_and_standardize()
878+
reference_snps, reference_dtype = NaNCNCTestCases(iid_index_list, snp_index_list, standardizer, snp_reader_factory_bed(), np.float64, "C", "False", None, None).read_and_standardize()
879879
for snpreader_factory in [snp_reader_factory_bed,
880880
snp_reader_factory_snpmajor_hdf5, snp_reader_factory_iidmajor_hdf5,
881881
snp_reader_factory_dat
882882
]:
883-
for dtype in [sp.float64,sp.float32]:
883+
for dtype in [np.float64,np.float32]:
884884
for order in ["C", "F"]:
885885
for force_python_only in [False, True]:
886886
snpreader = snpreader_factory()
@@ -929,7 +929,7 @@ def runTest(self, result = None):
929929
self.assertTrue(snps[0,0] == 0)
930930
self.assertTrue(np.all(snps[:,1] == 0))
931931
if self.reference_snps is not None:
932-
self.assertTrue(np.allclose(self.reference_snps, snps, rtol=1e-04 if dtype == sp.float32 or self.reference_dtype == sp.float32 else 1e-12))
932+
self.assertTrue(np.allclose(self.reference_snps, snps, rtol=1e-04 if dtype == np.float32 or self.reference_dtype == np.float32 else 1e-12))
933933

934934

935935

0 commit comments

Comments
 (0)