From d9cb10e6bda109bc9b4a6ca8b92d7f3342b56492 Mon Sep 17 00:00:00 2001 From: Xiang Date: Fri, 22 Apr 2022 14:46:14 +0800 Subject: [PATCH] updated configs fixed gradient checkpoint added TODO updated README.md --- recognition/arcface_torch/README.md | 90 +++++++++++-------- .../arcface_torch/backbones/iresnet.py | 6 +- recognition/arcface_torch/configs/base.py | 1 + ...0k_mobileface_bs4k.py => glint360k_mbf.py} | 8 +- ..._r100_bs4k_16gpus.py => glint360k_r100.py} | 10 +-- .../arcface_torch/configs/glint360k_r50.py | 27 ++++++ .../arcface_torch/configs/ms1mv2_mbf.py | 27 ++++++ .../arcface_torch/configs/ms1mv2_r100.py | 27 ++++++ .../arcface_torch/configs/ms1mv2_r50.py | 27 ++++++ .../{ms1mv3_mobileface.py => ms1mv3_mbf.py} | 8 +- .../arcface_torch/configs/ms1mv3_r100.py | 4 +- .../arcface_torch/configs/ms1mv3_r50.py | 6 +- ...y => wf12m_conflict_r50_pfc03_filter04.py} | 0 ...50.py => wf12m_flip_pfc01_filter04_r50.py} | 2 +- .../arcface_torch/configs/wf42m_pfc02_r100.py | 27 ++++++ .../configs/wf42m_pfc02_r100_16gpus.py | 27 ++++++ .../configs/wf42m_pfc02_r100_32gpus.py | 27 ++++++ recognition/arcface_torch/train.py | 1 + 18 files changed, 266 insertions(+), 59 deletions(-) rename recognition/arcface_torch/configs/{glint360k_mobileface_bs4k.py => glint360k_mbf.py} (86%) rename recognition/arcface_torch/configs/{glint360k_r100_bs4k_16gpus.py => glint360k_r100.py} (82%) create mode 100644 recognition/arcface_torch/configs/glint360k_r50.py create mode 100644 recognition/arcface_torch/configs/ms1mv2_mbf.py create mode 100644 recognition/arcface_torch/configs/ms1mv2_r100.py create mode 100644 recognition/arcface_torch/configs/ms1mv2_r50.py rename recognition/arcface_torch/configs/{ms1mv3_mobileface.py => ms1mv3_mbf.py} (86%) rename recognition/arcface_torch/configs/{wf12m_conflict_r50_pfc02_filter04.py => wf12m_conflict_r50_pfc03_filter04.py} (100%) rename recognition/arcface_torch/configs/{wf12m_flip_pfc02_filter04_r50.py => wf12m_flip_pfc01_filter04_r50.py} (96%) create 
mode 100644 recognition/arcface_torch/configs/wf42m_pfc02_r100.py create mode 100644 recognition/arcface_torch/configs/wf42m_pfc02_r100_16gpus.py create mode 100644 recognition/arcface_torch/configs/wf42m_pfc02_r100_32gpus.py diff --git a/recognition/arcface_torch/README.md b/recognition/arcface_torch/README.md index c24bbce82..b49e9acc6 100644 --- a/recognition/arcface_torch/README.md +++ b/recognition/arcface_torch/README.md @@ -61,47 +61,63 @@ For **ICCV2021-MFR-ALL** set, TAR is measured on all-to-all 1:1 protocal, with F globalised multi-racial testset contains 242,143 identities and 1,624,305 images. -> 1. Large Scale Datasets - -| Datasets | Backbone | **MFR-ALL** | IJB-C(1E-4) | IJB-C(1E-5) | Training Throughout | log | -|:-----------------|:------------|:------------|:------------|:------------|:--------------------|:------------------------------------------------------------------------------------------------------------------------------------------------| -| MS1MV3 | mobileface | 65.76 | 94.44 | 91.85 | ~13000 | [click me](https://raw.githubusercontent.com/anxiangsir/insightface_arcface_log/master/ms1mv3_mobileface_lr02/training.log) | -| Glint360K | mobileface | 69.83 | 95.17 | 92.58 | -11000 | [click me](https://raw.githubusercontent.com/anxiangsir/insightface_arcface_log/master/glint360k_mobileface_lr02_bs4k/training.log) | -| WF42M-PFC-0.2 | mobileface | 73.80 | 95.40 | 92.64 | (16GPUs)~18583 | [click me](https://raw.githubusercontent.com/anxiangsir/insightface_arcface_log/master/webface42m_mobilefacenet_pfc02_bs8k_16gpus/training.log) | -| MS1MV3 | r100 | 83.23 | 96.88 | 95.31 | ~3400 | [click me](https://raw.githubusercontent.com/anxiangsir/insightface_arcface_log/master/ms1mv3_r100_lr02/training.log) | -| Glint360K | r100 | 90.86 | 97.53 | 96.43 | ~5000 | [click me](https://raw.githubusercontent.com/anxiangsir/insightface_arcface_log/master/glint360k_r100_lr02_bs4k_16gpus/training.log) | -| WF42M-PFC-0.2 | r50(bs4k) | 93.83 | 97.53 | 96.16 | (8 
GPUs)~5900 | [click me](https://raw.githubusercontent.com/anxiangsir/insightface_arcface_log/master/webface42m_r50_bs4k_pfc02/training.log) | -| WF42M-PFC-0.2 | r50(bs8k) | 93.96 | 97.46 | 96.12 | (16GPUs)~11000 | [click me](https://raw.githubusercontent.com/anxiangsir/insightface_arcface_log/master/webface42m_r50_lr01_pfc02_bs8k_16gpus/training.log) | -| WF42M-PFC-0.2 | r50(bs4k) | 94.04 | 97.48 | 95.94 | (32GPUs)~17000 | click me | -| WF42M-PFC-0.0018 | r100(bs16k) | 93.08 | 97.51 | 95.88 | (32GPUs)~10000 | click me | -| WF42M-PFC-0.2 | r100(bs4k) | 96.69 | 97.85 | 96.63 | (16GPUs)~5200 | [click me](https://raw.githubusercontent.com/anxiangsir/insightface_arcface_log/master/webface42m_r100_bs4k_pfc02/training.log) | - -> 2. VIT For Face Recognition - -| Datasets | Backbone | FLOPs | **MFR-ALL** | IJB-C(1E-4) | IJB-C(1E-5) | Training Throughout | log | -|:--------------|:-------------|:------|:------------|:------------|:------------|:--------------------|:---------| -| WF42M-PFC-0.3 | R18(bs4k) | 2.6 | 79.13 | 95.77 | 93.36 | - | click me | -| WF42M-PFC-0.3 | R50(bs4k) | 6.3 | 94.03 | 97.48 | 95.94 | - | click me | -| WF42M-PFC-0.3 | R100(bs4k) | 12.1 | 96.69 | 97.82 | 96.45 | - | click me | -| WF42M-PFC-0.3 | R200(bs4k) | 23.5 | 97.70 | 97.97 | 96.93 | - | click me | -| WF42M-PFC-0.3 | VIT-T(bs24k) | 1.5 | 92.24 | 97.31 | 95.97 | (64GPUs)~35000 | click me | -| WF42M-PFC-0.3 | VIT-S(bs24k) | 5.7 | 95.87 | 97.73 | 96.57 | (64GPUs)~25000 | click me | -| WF42M-PFC-0.3 | VIT-B(bs24k) | 11.4 | 97.42 | 97.90 | 97.04 | (64GPUs)~13800 | click me | -| WF42M-PFC-0.3 | VIT-L(bs24k) | 25.3 | 97.85 | 98.00 | 97.23 | (64GPUs)~9406 | click me | - -WF42M means WebFace42M, `PFC-0.3` means negivate class centers sample rate is 0.3. - -> 3. Noisy Datasets +#### 1. 
Training on Single-Host GPU + +| Datasets | Backbone | **MFR-ALL** | IJB-C(1E-4) | IJB-C(1E-5) | log | +|:--------------|:--------------------|:------------|:------------|:------------|:------------------------------------------------------------------------------------------------------------------------------------| +| MS1MV2 | mobilefacenet-0.45G | 62.07 | 93.61 | 90.28 | [click me](https://raw.githubusercontent.com/anxiangsir/insightface_arcface_log/master/ms1mv2_mbf/training.log) | +| MS1MV3 | mobilefacenet-0.45G | 63.78 | 94.23 | 91.33 | [click me](https://raw.githubusercontent.com/anxiangsir/insightface_arcface_log/master/ms1mv3_mbf/training.log) | +| Glint360K | mobilefacenet-0.45G | 70.18 | 95.04 | 92.62 | [click me](https://raw.githubusercontent.com/anxiangsir/insightface_arcface_log/master/glint360k_mbf/training.log) | +| MS1MV2 | r50 | 70.35 | 95.43 | 93.34 | [click me](https://raw.githubusercontent.com/anxiangsir/insightface_arcface_log/master/ms1mv2_r50/training.log) | +| MS1MV3 | r50 | 79.14 | 96.37 | 94.47 | [click me](https://raw.githubusercontent.com/anxiangsir/insightface_arcface_log/master/ms1mv3_r50/training.log) | +| Glint360K | r50 | 86.34 | 97.16 | 95.81 | [click me](https://raw.githubusercontent.com/anxiangsir/insightface_arcface_log/master/glint360k_r50/training.log) | +| MS1MV2 | r100 | 69.79 | 95.85 | 93.93 | [click me](https://raw.githubusercontent.com/anxiangsir/insightface_arcface_log/master/ms1mv2_r100/training.log) | +| MS1MV3 | r100 | 81.97 | 96.85 | 95.02 | [click me](https://raw.githubusercontent.com/anxiangsir/insightface_arcface_log/master/ms1mv3_r100/training.log) | +| Glint360k | r100 | 89.52 | 97.55 | 96.38 | [click me](https://raw.githubusercontent.com/anxiangsir/insightface_arcface_log/master/glint360k_r100/training.log) | +| WF42M-PFC-0.2 | R100 | 96.27 | 97.70 | 96.31 | [click me](https://raw.githubusercontent.com/anxiangsir/insightface_arcface_log/master/wf42m_pfc02_r100/training.log) | +| WF42M-PFC-0.2 | ViT-T-1.5G | 
92.04 | 97.27 | 95.68 | [click me](https://raw.githubusercontent.com/anxiangsir/insightface_arcface_log/master/wf42m_pfc02_40epoch_8gpu_vit_t/training.log) | + +#### 2. Training on Multi-Host GPU + +| Datasets | Backbone(bs*gpus) | **MFR-ALL** | IJB-C(1E-4) | IJB-C(1E-5) | Throughput | log | +|:-----------------|:------------------|:------------|:------------|:------------|:-----------|:-------------------------------------------------------------------------------------------------------------------------------------------| +| WF42M-PFC-0.2 | r50(512*8) | 93.83 | 97.53 | 96.16 | ~5900 | [click me](https://raw.githubusercontent.com/anxiangsir/insightface_arcface_log/master/webface42m_r50_bs4k_pfc02/training.log) | +| WF42M-PFC-0.2 | r50(512*16) | 93.96 | 97.46 | 96.12 | ~11000 | [click me](https://raw.githubusercontent.com/anxiangsir/insightface_arcface_log/master/webface42m_r50_lr01_pfc02_bs8k_16gpus/training.log) | +| WF42M-PFC-0.2 | r50(128*32) | 94.04 | 97.48 | 95.94 | ~17000 | click me | +| WF42M-PFC-0.2 | r100(128*16) | 96.28 | 97.80 | 96.57 | ~5200 | click me | +| WF42M-PFC-0.2 | r100(256*16) | 96.69 | 97.85 | 96.63 | ~5200 | [click me](https://raw.githubusercontent.com/anxiangsir/insightface_arcface_log/master/webface42m_r100_bs4k_pfc02/training.log) | +| WF42M-PFC-0.0018 | r100(512*32) | 93.08 | 97.51 | 95.88 | ~10000 | click me | +| WF42M-PFC-0.2 | r100(128*32) | 96.57 | 97.83 | 96.50 | ~9800 | click me | + +`r100(128*32)` means the backbone is r100, the batch size per GPU is 128, and the number of GPUs is 32. + + + +#### 3. 
ViT For Face Recognition + +| Datasets | Backbone(bs) | FLOPs | **MFR-ALL** | IJB-C(1E-4) | IJB-C(1E-5) | Throughput | log | +|:--------------|:--------------|:------|:------------|:------------|:------------|:-----------|:-----------------------------------------------------------------------------------------------------------------------------| +| WF42M-PFC-0.3 | r18(128*32) | 2.6 | 79.13 | 95.77 | 93.36 | - | click me | +| WF42M-PFC-0.3 | r50(128*32) | 6.3 | 94.03 | 97.48 | 95.94 | - | click me | +| WF42M-PFC-0.3 | r100(128*32) | 12.1 | 96.69 | 97.82 | 96.45 | - | click me | +| WF42M-PFC-0.3 | r200(128*32) | 23.5 | 97.70 | 97.97 | 96.93 | - | click me | +| WF42M-PFC-0.3 | VIT-T(384*64) | 1.5 | 92.24 | 97.31 | 95.97 | ~35000 | click me | +| WF42M-PFC-0.3 | VIT-S(384*64) | 5.7 | 95.87 | 97.73 | 96.57 | ~25000 | [click me](https://raw.githubusercontent.com/anxiangsir/insightface_arcface_log/master/pfc03_wf42m_vit_s_64gpu/training.log) | +| WF42M-PFC-0.3 | VIT-B(384*64) | 11.4 | 97.42 | 97.90 | 97.04 | ~13800 | [click me](https://raw.githubusercontent.com/anxiangsir/insightface_arcface_log/master/pfc03_wf42m_vit_b_64gpu/training.log) | +| WF42M-PFC-0.3 | VIT-L(384*64) | 25.3 | 97.85 | 98.00 | 97.23 | ~9406 | [click me](https://raw.githubusercontent.com/anxiangsir/insightface_arcface_log/master/pfc03_wf42m_vit_l_64gpu/training.log) | + +`WF42M` means WebFace42M, `PFC-0.3` means negative class centers sample rate is 0.3. + +#### 4. Noisy Datasets | Datasets | Backbone | **MFR-ALL** | IJB-C(1E-4) | IJB-C(1E-5) | log | |:-------------------------|:---------|:------------|:------------|:------------|:---------| -| WF12M-Flip(40%) | R50 | 43.87 | 88.35 | 80.78 | click me | -| WF12M-Flip(40%)-PFC-0.3* | R50 | 80.20 | 96.11 | 93.79 | click me | -| WF12M-Conflict | R50 | 79.93 | 95.30 | 91.56 | click me | -| WF12M-Conflict-PFC-0.3* | R50 | 91.68 | 97.28 | 95.75 | click me | - -WF12M means WebFace12M, `+PFC-0.3*` denotes additional abnormal inter-class filtering. 
+| WF12M-Flip(40%) | r50 | 43.87 | 88.35 | 80.78 | click me | +| WF12M-Flip(40%)-PFC-0.1* | r50 | 80.20 | 96.11 | 93.79 | click me | +| WF12M-Conflict | r50 | 79.93 | 95.30 | 91.56 | click me | +| WF12M-Conflict-PFC-0.3* | r50 | 91.68 | 97.28 | 95.75 | click me | +`WF12M` means WebFace12M, `+PFC-0.1*` denotes additional abnormal inter-class filtering. diff --git a/recognition/arcface_torch/backbones/iresnet.py b/recognition/arcface_torch/backbones/iresnet.py index 8b00e99de..6f2347c92 100644 --- a/recognition/arcface_torch/backbones/iresnet.py +++ b/recognition/arcface_torch/backbones/iresnet.py @@ -44,7 +44,7 @@ def __init__(self, inplanes, planes, stride=1, downsample=None, self.downsample = downsample self.stride = stride - def forard_impl(self, x): + def forward_impl(self, x): identity = x out = self.bn1(x) out = self.conv1(out) @@ -59,9 +59,9 @@ def forard_impl(self, x): def forward(self, x): if self.training and using_ckpt: - return checkpoint(self.forard_imlp, x) + return checkpoint(self.forward_impl, x) else: - return self.forard_impl(x) + return self.forward_impl(x) class IResNet(nn.Module): diff --git a/recognition/arcface_torch/configs/base.py b/recognition/arcface_torch/configs/base.py index 49c9a8379..897da60de 100644 --- a/recognition/arcface_torch/configs/base.py +++ b/recognition/arcface_torch/configs/base.py @@ -6,6 +6,7 @@ config = edict() +# Margin Base Softmax config.margin_list = (1.0, 0.5, 0.0) config.network = "r50" config.resume = False diff --git a/recognition/arcface_torch/configs/glint360k_mobileface_bs4k.py b/recognition/arcface_torch/configs/glint360k_mbf.py similarity index 86% rename from recognition/arcface_torch/configs/glint360k_mobileface_bs4k.py rename to recognition/arcface_torch/configs/glint360k_mbf.py index fe6d01299..b32f0016b 100644 --- a/recognition/arcface_torch/configs/glint360k_mobileface_bs4k.py +++ b/recognition/arcface_torch/configs/glint360k_mbf.py @@ -14,14 +14,14 @@ config.fp16 = True config.momentum = 0.9 
config.weight_decay = 1e-4 -config.batch_size = 512 -config.lr = 0.4 -config.verbose = 5000 +config.batch_size = 128 +config.lr = 0.1 +config.verbose = 2000 config.dali = False config.rec = "/train_tmp/glint360k" config.num_classes = 360232 config.num_image = 17091657 config.num_epoch = 20 -config.warmup_epoch = 2 +config.warmup_epoch = 0 config.val_targets = ['lfw', 'cfp_fp', "agedb_30"] diff --git a/recognition/arcface_torch/configs/glint360k_r100_bs4k_16gpus.py b/recognition/arcface_torch/configs/glint360k_r100.py similarity index 82% rename from recognition/arcface_torch/configs/glint360k_r100_bs4k_16gpus.py rename to recognition/arcface_torch/configs/glint360k_r100.py index 5cb739891..3b8bbb786 100644 --- a/recognition/arcface_torch/configs/glint360k_r100_bs4k_16gpus.py +++ b/recognition/arcface_torch/configs/glint360k_r100.py @@ -13,15 +13,15 @@ config.sample_rate = 1.0 config.fp16 = True config.momentum = 0.9 -config.weight_decay = 5e-4 -config.batch_size = 256 -config.lr = 0.4 -config.verbose = 5000 +config.weight_decay = 1e-4 +config.batch_size = 128 +config.lr = 0.1 +config.verbose = 2000 config.dali = False config.rec = "/train_tmp/glint360k" config.num_classes = 360232 config.num_image = 17091657 config.num_epoch = 20 -config.warmup_epoch = 2 +config.warmup_epoch = 0 config.val_targets = ['lfw', 'cfp_fp', "agedb_30"] diff --git a/recognition/arcface_torch/configs/glint360k_r50.py b/recognition/arcface_torch/configs/glint360k_r50.py new file mode 100644 index 000000000..4eeb28f84 --- /dev/null +++ b/recognition/arcface_torch/configs/glint360k_r50.py @@ -0,0 +1,27 @@ +from easydict import EasyDict as edict + +# make training faster +# our RAM is 256G +# mount -t tmpfs -o size=140G tmpfs /train_tmp + +config = edict() +config.margin_list = (1.0, 0.0, 0.4) +config.network = "r50" +config.resume = False +config.output = None +config.embedding_size = 512 +config.sample_rate = 1.0 +config.fp16 = True +config.momentum = 0.9 +config.weight_decay = 1e-4 
+config.batch_size = 128 +config.lr = 0.1 +config.verbose = 2000 +config.dali = False + +config.rec = "/train_tmp/glint360k" +config.num_classes = 360232 +config.num_image = 17091657 +config.num_epoch = 20 +config.warmup_epoch = 0 +config.val_targets = ['lfw', 'cfp_fp', "agedb_30"] diff --git a/recognition/arcface_torch/configs/ms1mv2_mbf.py b/recognition/arcface_torch/configs/ms1mv2_mbf.py new file mode 100644 index 000000000..255a51ad6 --- /dev/null +++ b/recognition/arcface_torch/configs/ms1mv2_mbf.py @@ -0,0 +1,27 @@ +from easydict import EasyDict as edict + +# make training faster +# our RAM is 256G +# mount -t tmpfs -o size=140G tmpfs /train_tmp + +config = edict() +config.margin_list = (1.0, 0.5, 0.0) +config.network = "mbf" +config.resume = False +config.output = None +config.embedding_size = 512 +config.sample_rate = 1.0 +config.fp16 = True +config.momentum = 0.9 +config.weight_decay = 1e-4 +config.batch_size = 128 +config.lr = 0.1 +config.verbose = 2000 +config.dali = False + +config.rec = "/train_tmp/faces_emore" +config.num_classes = 85742 +config.num_image = 5822653 +config.num_epoch = 40 +config.warmup_epoch = 0 +config.val_targets = ['lfw', 'cfp_fp', "agedb_30"] diff --git a/recognition/arcface_torch/configs/ms1mv2_r100.py b/recognition/arcface_torch/configs/ms1mv2_r100.py new file mode 100644 index 000000000..2978d2dfe --- /dev/null +++ b/recognition/arcface_torch/configs/ms1mv2_r100.py @@ -0,0 +1,27 @@ +from easydict import EasyDict as edict + +# make training faster +# our RAM is 256G +# mount -t tmpfs -o size=140G tmpfs /train_tmp + +config = edict() +config.margin_list = (1.0, 0.5, 0.0) +config.network = "r100" +config.resume = False +config.output = None +config.embedding_size = 512 +config.sample_rate = 1.0 +config.fp16 = True +config.momentum = 0.9 +config.weight_decay = 1e-4 +config.batch_size = 128 +config.lr = 0.1 +config.verbose = 2000 +config.dali = False + +config.rec = "/train_tmp/faces_emore" +config.num_classes = 85742 
+config.num_image = 5822653 +config.num_epoch = 20 +config.warmup_epoch = 0 +config.val_targets = ['lfw', 'cfp_fp', "agedb_30"] diff --git a/recognition/arcface_torch/configs/ms1mv2_r50.py b/recognition/arcface_torch/configs/ms1mv2_r50.py new file mode 100644 index 000000000..80d6f1168 --- /dev/null +++ b/recognition/arcface_torch/configs/ms1mv2_r50.py @@ -0,0 +1,27 @@ +from easydict import EasyDict as edict + +# make training faster +# our RAM is 256G +# mount -t tmpfs -o size=140G tmpfs /train_tmp + +config = edict() +config.margin_list = (1.0, 0.5, 0.0) +config.network = "r50" +config.resume = False +config.output = None +config.embedding_size = 512 +config.sample_rate = 1.0 +config.fp16 = True +config.momentum = 0.9 +config.weight_decay = 1e-4 +config.batch_size = 128 +config.lr = 0.1 +config.verbose = 2000 +config.dali = False + +config.rec = "/train_tmp/faces_emore" +config.num_classes = 85742 +config.num_image = 5822653 +config.num_epoch = 20 +config.warmup_epoch = 0 +config.val_targets = ['lfw', 'cfp_fp', "agedb_30"] diff --git a/recognition/arcface_torch/configs/ms1mv3_mobileface.py b/recognition/arcface_torch/configs/ms1mv3_mbf.py similarity index 86% rename from recognition/arcface_torch/configs/ms1mv3_mobileface.py rename to recognition/arcface_torch/configs/ms1mv3_mbf.py index 5a6dda2d8..731b4a261 100644 --- a/recognition/arcface_torch/configs/ms1mv3_mobileface.py +++ b/recognition/arcface_torch/configs/ms1mv3_mbf.py @@ -14,14 +14,14 @@ config.fp16 = True config.momentum = 0.9 config.weight_decay = 1e-4 -config.batch_size = 256 -config.lr = 0.2 -config.verbose = 5000 +config.batch_size = 128 +config.lr = 0.1 +config.verbose = 2000 config.dali = False config.rec = "/train_tmp/ms1m-retinaface-t1" config.num_classes = 93431 config.num_image = 5179510 config.num_epoch = 40 -config.warmup_epoch = 2 +config.warmup_epoch = 0 config.val_targets = ['lfw', 'cfp_fp', "agedb_30"] diff --git a/recognition/arcface_torch/configs/ms1mv3_r100.py 
b/recognition/arcface_torch/configs/ms1mv3_r100.py index b11373fa8..e7af3cef4 100644 --- a/recognition/arcface_torch/configs/ms1mv3_r100.py +++ b/recognition/arcface_torch/configs/ms1mv3_r100.py @@ -15,13 +15,13 @@ config.momentum = 0.9 config.weight_decay = 5e-4 config.batch_size = 128 -config.lr = 0.2 +config.lr = 0.1 config.verbose = 2000 config.dali = False config.rec = "/train_tmp/ms1m-retinaface-t1" config.num_classes = 93431 config.num_image = 5179510 -config.num_epoch = 25 +config.num_epoch = 20 config.warmup_epoch = 0 config.val_targets = ['lfw', 'cfp_fp', "agedb_30"] diff --git a/recognition/arcface_torch/configs/ms1mv3_r50.py b/recognition/arcface_torch/configs/ms1mv3_r50.py index 4946adef9..f1467f0a5 100644 --- a/recognition/arcface_torch/configs/ms1mv3_r50.py +++ b/recognition/arcface_torch/configs/ms1mv3_r50.py @@ -15,13 +15,13 @@ config.momentum = 0.9 config.weight_decay = 5e-4 config.batch_size = 128 -config.lr = 0.2 +config.lr = 0.1 config.verbose = 2000 config.dali = False config.rec = "/train_tmp/ms1m-retinaface-t1" config.num_classes = 93431 config.num_image = 5179510 -config.num_epoch = 25 -config.warmup_epoch = 2 +config.num_epoch = 20 +config.warmup_epoch = 0 config.val_targets = ['lfw', 'cfp_fp', "agedb_30"] diff --git a/recognition/arcface_torch/configs/wf12m_conflict_r50_pfc02_filter04.py b/recognition/arcface_torch/configs/wf12m_conflict_r50_pfc03_filter04.py similarity index 100% rename from recognition/arcface_torch/configs/wf12m_conflict_r50_pfc02_filter04.py rename to recognition/arcface_torch/configs/wf12m_conflict_r50_pfc03_filter04.py diff --git a/recognition/arcface_torch/configs/wf12m_flip_pfc02_filter04_r50.py b/recognition/arcface_torch/configs/wf12m_flip_pfc01_filter04_r50.py similarity index 96% rename from recognition/arcface_torch/configs/wf12m_flip_pfc02_filter04_r50.py rename to recognition/arcface_torch/configs/wf12m_flip_pfc01_filter04_r50.py index 6ad633383..2c1018b7f 100644 --- 
a/recognition/arcface_torch/configs/wf12m_flip_pfc02_filter04_r50.py +++ b/recognition/arcface_torch/configs/wf12m_flip_pfc01_filter04_r50.py @@ -10,7 +10,7 @@ config.resume = False config.output = None config.embedding_size = 512 -config.sample_rate = 0.2 +config.sample_rate = 0.1 config.interclass_filtering_threshold = 0.4 config.fp16 = True config.weight_decay = 5e-4 diff --git a/recognition/arcface_torch/configs/wf42m_pfc02_r100.py b/recognition/arcface_torch/configs/wf42m_pfc02_r100.py new file mode 100644 index 000000000..5274a52f2 --- /dev/null +++ b/recognition/arcface_torch/configs/wf42m_pfc02_r100.py @@ -0,0 +1,27 @@ +from easydict import EasyDict as edict + +# make training faster +# our RAM is 256G +# mount -t tmpfs -o size=140G tmpfs /train_tmp + +config = edict() +config.margin_list = (1.0, 0.0, 0.4) +config.network = "r100" +config.resume = False +config.output = None +config.embedding_size = 512 +config.sample_rate = 0.2 +config.fp16 = True +config.momentum = 0.9 +config.weight_decay = 5e-4 +config.batch_size = 128 +config.lr = 0.1 +config.verbose = 10000 +config.dali = False + +config.rec = "/train_tmp/WebFace42M" +config.num_classes = 2059906 +config.num_image = 42474557 +config.num_epoch = 20 +config.warmup_epoch = 0 +config.val_targets = ['lfw', 'cfp_fp', "agedb_30"] diff --git a/recognition/arcface_torch/configs/wf42m_pfc02_r100_16gpus.py b/recognition/arcface_torch/configs/wf42m_pfc02_r100_16gpus.py new file mode 100644 index 000000000..c1e8f1991 --- /dev/null +++ b/recognition/arcface_torch/configs/wf42m_pfc02_r100_16gpus.py @@ -0,0 +1,27 @@ +from easydict import EasyDict as edict + +# make training faster +# our RAM is 256G +# mount -t tmpfs -o size=140G tmpfs /train_tmp + +config = edict() +config.margin_list = (1.0, 0.0, 0.4) +config.network = "r100" +config.resume = False +config.output = None +config.embedding_size = 512 +config.sample_rate = 0.2 +config.fp16 = True +config.momentum = 0.9 +config.weight_decay = 5e-4 +config.batch_size = 
128 +config.lr = 0.2 +config.verbose = 10000 +config.dali = False + +config.rec = "/train_tmp/WebFace42M" +config.num_classes = 2059906 +config.num_image = 42474557 +config.num_epoch = 20 +config.warmup_epoch = config.num_epoch // 10 +config.val_targets = ['lfw', 'cfp_fp', "agedb_30"] diff --git a/recognition/arcface_torch/configs/wf42m_pfc02_r100_32gpus.py b/recognition/arcface_torch/configs/wf42m_pfc02_r100_32gpus.py new file mode 100644 index 000000000..f7787675d --- /dev/null +++ b/recognition/arcface_torch/configs/wf42m_pfc02_r100_32gpus.py @@ -0,0 +1,27 @@ +from easydict import EasyDict as edict + +# make training faster +# our RAM is 256G +# mount -t tmpfs -o size=140G tmpfs /train_tmp + +config = edict() +config.margin_list = (1.0, 0.0, 0.4) +config.network = "r100" +config.resume = False +config.output = None +config.embedding_size = 512 +config.sample_rate = 0.2 +config.fp16 = True +config.momentum = 0.9 +config.weight_decay = 5e-4 +config.batch_size = 128 +config.lr = 0.4 +config.verbose = 10000 +config.dali = False + +config.rec = "/train_tmp/WebFace42M" +config.num_classes = 2059906 +config.num_image = 42474557 +config.num_epoch = 20 +config.warmup_epoch = config.num_epoch // 10 +config.val_targets = ['lfw', 'cfp_fp', "agedb_30"] diff --git a/recognition/arcface_torch/train.py b/recognition/arcface_torch/train.py index a7198c80e..0b8de293b 100644 --- a/recognition/arcface_torch/train.py +++ b/recognition/arcface_torch/train.py @@ -80,6 +80,7 @@ def main(args): margin_loss, cfg.embedding_size, cfg.num_classes, cfg.sample_rate, cfg.fp16) module_partial_fc.train().cuda() + # TODO the params of partial fc must be last in the params list opt = torch.optim.SGD( params=[{"params": backbone.parameters()}, {"params": module_partial_fc.parameters()}], lr=cfg.lr, momentum=0.9, weight_decay=cfg.weight_decay)