# process_pannuke.py
# Credit: This code is modified from the original code {https://github.com/PathologyFoundation/plip/blob/main/reproducibility/generate_validation_datasets}
# =============================================================================
import sys, os, platform, copy
opj = os.path.join
import pandas as pd
import numpy as np
from tqdm import tqdm
from PIL import Image, ImageFile
import shutil
import warnings
warnings.filterwarnings("ignore")
import multiprocess as mp
ImageFile.LOAD_TRUNCATED_IMAGES = True
# Number of channels on the last axis of each mask array. Per the note that
# accompanied this script: 0 neoplastic, 1 inflammatory, 2 connective/soft
# tissue, 3 dead, 4 epithelial cells, and background last. The original note
# numbered background "6", but only indices 0-5 are ever read, so it is
# presumably channel 5.
N_MASK_CHANNELS = 6


def load_fold(root, fold):
    """Load the images, masks and tissue types of one dataset fold.

    Args:
        root: Directory that contains the ``Fold <n>`` sub-directories.
        fold: Fold number used to build the ``Fold <n>`` / ``fold<n>`` paths.

    Returns:
        Tuple ``(images, masks, types)``. Images and masks are cast to
        ``uint8``; types are returned exactly as stored on disk.
    """
    fold_dir = opj(root, f'Fold {fold}')
    sub_dir = f'fold{fold}'
    images = np.load(opj(fold_dir, 'images', sub_dir, 'images.npy')).astype(np.uint8)
    # NOTE(review): the uint8 cast wraps any mask label above 255, which would
    # change the per-channel label counts below -- confirm the label range.
    masks = np.load(opj(fold_dir, 'masks', sub_dir, 'masks.npy')).astype(np.uint8)
    # types.npy is read from the *images* directory, next to images.npy.
    types = np.load(opj(fold_dir, 'images', sub_dir, 'types.npy'))
    return images, masks, types


def count_instances_per_channel(mask, n_channels=N_MASK_CHANNELS):
    """Count the distinct labels in each channel of a single mask.

    Each count is ``len(np.unique(channel)) - 1``: the number of distinct
    values minus one, i.e. the number of non-zero labels provided the value 0
    occurs somewhere in that channel.

    Args:
        mask: Array for one image whose last axis indexes the channels.
        n_channels: How many leading channels to count.

    Returns:
        List of ``n_channels`` ints, one count per channel.
    """
    return [len(np.unique(mask[..., ch])) - 1 for ch in range(n_channels)]


def describe_image(tissue_type, label, index):
    """Build the PNG file name and the caption for one exported image.

    Args:
        tissue_type: Tissue name; lower-cased, with underscores turned into
            spaces, before use.
        label: Class word inserted in both strings ('malignant' or 'benign'
            in this script).
        index: Position of the image within its class, zero-padded to 4 digits.

    Returns:
        Tuple ``(file_name, caption)``.
    """
    tissue = str(tissue_type).lower().replace('_', ' ')
    fname = '%s_%s_%04d.png' % (tissue, label, index)
    caption = 'An H&E image of %s %s tissue.' % (label, tissue)
    return fname, caption


def export_images(images, tissue_types, label, savedir):
    """Save every image as a PNG in ``savedir`` and tabulate path and caption.

    Args:
        images: Array of images, indexed along the first axis.
        tissue_types: Tissue name of each image, aligned with ``images``.
        label: Class word passed on to ``describe_image``.
        savedir: Existing directory the PNG files are written to.

    Returns:
        DataFrame with columns ``image`` (full path) and ``caption``, indexed
        0..len(images)-1.
    """
    paths, captions = [], []
    for i in tqdm(range(len(images))):
        fname, caption = describe_image(tissue_types[i], label, i)
        path = opj(savedir, fname)
        Image.fromarray(images[i]).save(path)
        paths.append(path)
        captions.append(caption)
    # Build the table once instead of concatenating one row per image.
    return pd.DataFrame({'image': paths, 'caption': captions})


if __name__ == '__main__':
    cwd = os.getcwd()
    # Explicit check instead of `assert`, which is stripped under `python -O`.
    if not cwd.endswith('pannuke'):
        raise SystemExit(f"Please make sure this script is in main 'pannuke' dataset directory and run it from the 'pannuke' directory. Current working directory is: {cwd}")

    fold_names = (1, 2, 3)

    print("Load npy files ...")
    folds = [load_fold(cwd, k) for k in fold_names]
    imgs, msks, typs = (np.concatenate(parts, axis=0) for parts in zip(*folds))
    del folds  # drop the per-fold copies before the heavy processing
    print("Finished loading.")

    # Drop images whose first five mask channels are entirely zero.
    is_background = np.sum(msks[..., 0:5].reshape(len(msks), -1), axis=1) == 0
    print(f'{np.sum(is_background)} images are purely background. Drop them.')
    imgs = imgs[~is_background]
    msks = msks[~is_background]
    typs = typs[~is_background]
    print(f'Total images: {len(imgs)}')

    # Per-image, per-channel label counts. Each mask is indexed directly, so
    # no whole-channel copy is made per image (the counts are unchanged).
    n_cells = np.zeros((len(imgs), N_MASK_CHANNELS), dtype=np.int64)
    for i in tqdm(range(len(imgs))):
        n_cells[i] = count_instances_per_channel(msks[i])

    n_tumor = n_cells[:, 0]
    # NOTE(review): the total sums all six channels, so it includes the count
    # from channel 5 (presumably background). Kept as-is so the selection of
    # images does not change -- confirm whether that is intended.
    total_cells = n_cells.sum(axis=1)

    print('Number of images contain neoplastic cells:', np.sum(n_cells[:, 0] > 0), '/', len(imgs))
    print('Number of images contain inflammatory cells:', np.sum(n_cells[:, 1] > 0), '/', len(imgs))
    print('Number of images contain epithelial cells:', np.sum(n_cells[:, 4] > 0), '/', len(imgs))
    print('----------------------------------------------------------------------------------')
    print('Threshold to determine tumor: n_tumor >= 10, and at least 30% of cells are tumors.')
    tumor_idx = (n_tumor >= 10) & (n_tumor / total_cells > 0.3)
    print('Number of tumor images:', np.sum(tumor_idx), '/', len(imgs))
    print('----------------------------------------------------------------------------------')
    print('Threshold to determine benign: n_tumor == 0')
    benign_idx = (n_tumor == 0)
    # This line previously reported the benign count under the label "tumor".
    print('Number of benign images:', np.sum(benign_idx), '/', len(imgs))

    imgs_malignant, imgs_benign = imgs[tumor_idx, ...], imgs[benign_idx, ...]
    typs_malignant, typs_benign = typs[tumor_idx, ...], typs[benign_idx, ...]

    for ttype in np.unique(typs):
        print(f'{ttype}\t maligant: {np.sum(typs_malignant == ttype)}\t benign: {np.sum(typs_benign == ttype)}')

    outdir = opj(cwd, 'processed_threshold=10_0.3')
    savedir = opj(outdir, 'images')
    os.makedirs(savedir, exist_ok=True)

    # Each class keeps its own 0-based index, so index values repeat in the CSV.
    df = pd.concat(
        [
            export_images(imgs_malignant, typs_malignant, 'malignant', savedir),
            export_images(imgs_benign, typs_benign, 'benign', savedir),
        ],
        axis=0,
    )
    df.to_csv(opj(outdir, 'PanNuke_all_binary.csv'))

    # WARNING: this deletes the raw fold directories once the export is done.
    for k in fold_names:
        shutil.rmtree(opj(cwd, f'Fold {k}'))
    print('Finished processing.')