Skip to content

Commit

Permalink
FaceID Part 2
Browse files Browse the repository at this point in the history
  • Loading branch information
oguzhanbsolak committed Dec 6, 2023
1 parent 1443ed1 commit 9109fd0
Show file tree
Hide file tree
Showing 3 changed files with 55 additions and 38 deletions.
52 changes: 30 additions & 22 deletions datasets/vggface2.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,36 +13,40 @@

import os
import glob
import torch
import numpy as np
import errno
import pickle

import numpy as np
import cv2
from tqdm import tqdm
from PIL import Image
import pickle


import torch
from torch.utils.data import Dataset
from torchvision import transforms
import torchvision.transforms.functional as FT
import ai8x
from torch.utils.data import Dataset

from utils import augmentation_utils
from skimage import transform as trans
from hawk_eyes.face import RetinaFace
import kornia.geometry.transform as GT

import ai8x


class VGGFace2(Dataset):
"""
VGGFace2 Dataset
"""
def __init__(self, root_dir, d_type, mode, transform=None, teacher_transform=None, img_size=(112, 112), args=None):
def __init__(self, root_dir, d_type, mode, transform=None,
teacher_transform=None, img_size=(112, 112)):

if d_type not in ('test', 'train'):
raise ValueError("d_type can only be set to 'test' or 'train'")

if mode not in ('detection', 'identification', 'identification_dr'):
raise ValueError("mode can only be set to 'detection', 'identification', or 'identification_dr'")
raise ValueError("mode can only be set to 'detection', 'identification',"
"or 'identification_dr'")

self.root_dir = root_dir
self.d_type = d_type
Expand All @@ -65,7 +69,8 @@ def __init__(self, root_dir, d_type, mode, transform=None, teacher_transform=Non
self.__makedir_exist_ok(os.path.join(self.dataset_path, "processed"))

if self.d_type in ('train', 'test'):
self.gt_path = os.path.join(self.dataset_path, "processed", self.d_type+"_vggface2.pickle")
self.gt_path = os.path.join(self.dataset_path, "processed",
self.d_type+"_vggface2.pickle")
self.d_path = os.path.join(self.dataset_path, self.d_type)
if not os.path.exists(self.gt_path):
assert os.path.isdir(self.d_path), (f'No dataset at {self.d_path}.\n'
Expand All @@ -81,7 +86,7 @@ def __init__(self, root_dir, d_type, mode, transform=None, teacher_transform=Non
' - test \n')

print("Extracting ground truth from the " + self.d_type + " set")
self.__extract_gt(args)
self.__extract_gt()

f = open(self.gt_path, 'rb')
self.pickle_dict = pickle.load(f)
Expand All @@ -92,7 +97,7 @@ def __init__(self, root_dir, d_type, mode, transform=None, teacher_transform=Non
return


def __extract_gt(self, args):
def __extract_gt(self):
"""
Extracts the ground truth from the dataset
"""
Expand Down Expand Up @@ -146,10 +151,10 @@ def __getitem__(self, index):
if self.mode == 'detection':
return self.__getitem_detection(index)

elif self.mode == 'identification':
if self.mode == 'identification':
return self.__getitem_identification(index)

elif self.mode == 'identification_dr':
if self.mode == 'identification_dr':
return self.__getitem_identification_dr(index)


Expand All @@ -164,7 +169,8 @@ def __getitem_detection(self, index):
boxes = self.pickle_dict["boxes"][index]
boxes = torch.as_tensor(boxes, dtype=torch.float32)

img, boxes = augmentation_utils.resize(img, boxes, dims=(self.img_size[0], self.img_size[1]))
img, boxes = augmentation_utils.resize(img, boxes,
dims=(self.img_size[0], self.img_size[1]))

labels = [1] * boxes.shape[0]

Expand Down Expand Up @@ -217,7 +223,8 @@ def __getitem_identification(self, index):
box[3] = np.clip(box[3] + (max_dim - height) / 2, 0, img.shape[1])

#Crop image with the square bounding box
img_C = FT.crop(img= img, top=int(box[1]), left=int(box[0]), height=int(box[3]-box[1]), width=int(box[2]-box[0]))
img_C = FT.crop(img= img, top=int(box[1]), left=int(box[0]),
height=int(box[3]-box[1]), width=int(box[2]-box[0]))

#Check if the cropped image is square, if not, pad it
_, h, w = img_C.shape
Expand Down Expand Up @@ -308,7 +315,8 @@ def VGGFace2_FaceID_get_datasets(data, load_train=True, load_test=True, img_size
train_transform = transforms.Compose([
transforms.ToTensor(),
transforms.RandomHorizontalFlip(p=0.5),
transforms.ColorJitter(brightness=(0.6,1.4),saturation=(0.6,1.4),contrast=(0.6,1.4),hue=(-0.4,0.4)),
transforms.ColorJitter(brightness=(0.6,1.4),
saturation=(0.6,1.4),contrast=(0.6,1.4),hue=(-0.4,0.4)),
transforms.RandomErasing(p=0.1),
ai8x.normalize(args=args)
])
Expand All @@ -321,7 +329,7 @@ def VGGFace2_FaceID_get_datasets(data, load_train=True, load_test=True, img_size

train_dataset = VGGFace2(root_dir=data_dir, d_type='train', mode='identification',
transform=train_transform, teacher_transform=teacher_transform,
img_size=img_size, args=args)
img_size=img_size)

print(f'Train dataset length: {len(train_dataset)}\n')
else:
Expand All @@ -333,7 +341,7 @@ def VGGFace2_FaceID_get_datasets(data, load_train=True, load_test=True, img_size

test_dataset = VGGFace2(root_dir=data_dir, d_type='test', mode='identification',
transform=test_transform, teacher_transform=teacher_transform,
img_size=img_size, args=args)
img_size=img_size)

print(f'Test dataset length: {len(test_dataset)}\n')
else:
Expand All @@ -356,7 +364,7 @@ def VGGFace2_FaceID_dr_get_datasets(data, load_train=True, load_test=True, img_s
if load_train:

train_dataset = VGGFace2(root_dir=data_dir, d_type='train', mode='identification_dr',
transform=train_transform, img_size=img_size, args=args)
transform=train_transform, img_size=img_size)

print(f'Train dataset length: {len(train_dataset)}\n')
else:
Expand All @@ -367,7 +375,7 @@ def VGGFace2_FaceID_dr_get_datasets(data, load_train=True, load_test=True, img_s
ai8x.normalize(args=args)])

test_dataset = VGGFace2(root_dir=data_dir, d_type='test', mode='identification_dr',
transform=test_transform, img_size=img_size, args=args)
transform=test_transform, img_size=img_size)

print(f'Test dataset length: {len(test_dataset)}\n')
else:
Expand All @@ -387,7 +395,7 @@ def VGGFace2_Facedet_get_datasets(data, load_train=True, load_test=True, img_siz
])

train_dataset = VGGFace2(root_dir=data_dir, d_type='train', mode='detection',
transform=train_transform, img_size=img_size, args=args)
transform=train_transform, img_size=img_size)

print(f'Train dataset length: {len(train_dataset)}\n')
else:
Expand All @@ -397,7 +405,7 @@ def VGGFace2_Facedet_get_datasets(data, load_train=True, load_test=True, img_siz
test_transform = transforms.Compose([ai8x.normalize(args=args)])

test_dataset = VGGFace2(root_dir=data_dir, d_type='test', mode='detection',
transform=test_transform, img_size=img_size, args=args)
transform=test_transform, img_size=img_size)

print(f'Test dataset length: {len(test_dataset)}\n')
else:
Expand Down
39 changes: 24 additions & 15 deletions docs/FacialRecognitionSystem.md
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
# Facial Recognition System

This document aims to explain facial recognition applications for MAX7800x series microcontrollers. The facial recognition task consists of three main parts: face detection, face identification, and dot product.

- The face detection model detects faces in the captured image and extracts a rectangular sub-image containing only one face.
Expand All @@ -9,48 +10,53 @@ Figure 1 depicts the facial recognition system sequential diagram.

<img src="facialrecognition.png" style="zoom: 50%;" />

Figure 1. MAX7800x facial recognition system
​Figure 1. MAX7800x facial recognition system

## Dataset
The first step will be the dataset preparation. The dataset is VGGFace-2 [1].
Please review the terms and conditions at https://www.robots.ox.ac.uk/~vgg/data/vgg_face2/. Then, download the dataset and extract raw images to the train and test subfolders.

The first step will be the dataset preparation. The dataset is VGGFace-2 [1].
Please review the terms and conditions at [VGGFace2](https://www.robots.ox.ac.uk/~vgg/data/vgg_face2/). Then, download the dataset and extract raw images to the train and test subfolders.

Expected folder structure:

- root_dir
- VGGFace-2
- train
- VGGFace-2
- train
- test

FaceID and Face Detection tasks share the same ground truth pickle, and it will be automatically generated when one of these tasks is started.

## Face Detection

To localize faces, a facial recognition system generally uses a face detection algorithm. Face detection is an object detection problem that has various solutions in the literature. This work targets a face detection algorithm that runs on MAX7800x series microcontrollers with real-time performance.

Previously, a Tiny SSD[2]-based object detection algorithm for the MAX7800x, named Tinier SSD, was developed for the digit detection problem. The face detection model is a modified version of the digit detection model. The modification reduces the number of parameters and enables a larger input size.

To train the face detection model, the "scripts/train_facedet_tinierssd.sh" script can be used.
To train the face detection model, the "scripts/train_facedet_tinierssd.sh" script can be used.

## FaceID
To train a faceID model for MAX7800x microcontrollers, there are multiple steps. As the MAX7800x FaceID models will be trained in a knowledge distillation fashion, the first step will be downloading a backbone checkpoint for the teacher model.

To train a faceID model for MAX7800x microcontrollers, there are multiple steps. As the MAX7800x FaceID models will be trained in a knowledge distillation fashion, the first step will be downloading a backbone checkpoint for the teacher model.

The suggested teacher model is IR-152, but the other teacher models defined in "model_irse_DRL.py" may be used as well. Please review the terms and conditions at face.evoLVe[3] repository, and download the checkpoint according to your teacher model selection.

There are two FaceID models, one for the MAX78000 and one for the MAX78002. The MAX78000 one is named faceid_112, and it is a relatively lightweight model. To enable more performance on MAX78002, a more complex model was developed, which is named mobilefacenet_112. To train the FaceID models, the "scripts/train_faceid_112.sh" and "scripts/train_mobilefacenet_112.sh" scripts can be used. The training scripts perform the Dimensionality Reduction and Relation-Based Knowledge Distillation steps automatically. A summary of Dimensionality Reduction and Relation-Based Knowledge Distillation can be found in the following sub-sections.
There are two FaceID models, one for the MAX78000 and one for the MAX78002. The MAX78000 one is named faceid_112, and it is a relatively lightweight model. To enable more performance on MAX78002, a more complex model was developed, which is named mobilefacenet_112. To train the FaceID models, the "scripts/train_faceid_112.sh" and "scripts/train_mobilefacenet_112.sh" scripts can be used. The training scripts perform the Dimensionality Reduction and Relation-Based Knowledge Distillation steps automatically. A summary of Dimensionality Reduction and Relation-Based Knowledge Distillation can be found in the following sub-sections.

### Dimensionality Reduction on the Teacher Model

Reducing embedding dimensionality can greatly reduce the post-processing operations and memory usage for the facial recognition system. To achieve this, the teacher backbone will be frozen and two additional Conv1d layers will be added to two teacher models. These additions are called dimension reduction layers. The suggested dimensionality for embeddings is 64, but other choices like 32, 128, and 256 can be tried. A summary of the dimensionality reduction is shown in Figure 2, and dimension reduction layers' details are represented in Table 1.


<img src="dimensionreductionlayers.png" style="zoom: 30%;" />

Figure 2. Dimensionality Reduction
​Figure 2. Dimensionality Reduction



Table 1. Dimension Reduction Layers

| Layer1 | Layer2 |
|--------------------------------------| ------------------------------------|
|--------------------------------------| ------------------------------------|
| Conv1d(In=512ch, Out=512Ch, Kernel=1)| Conv1d(In=512ch, Out=64Ch, Kernel=1)|
| BatchNorm1d(512) | BatchNorm1d(64) |
| PReLU(512) | |
Expand All @@ -61,29 +67,32 @@ To train dimensionality reduction layers Sub-Center ArcFace loss is used. The Su

<img src="SubCenterArcFaceLoss.png" style="zoom: 100%;" />

Figure 3. Sub-Center ArcFace Loss[4]
​Figure 3. Sub-Center ArcFace Loss[4]

### Relation-Based Knowledge Distillation

The knowledge distillation choice for the FaceID models was a relation-based one. The distillation loss was calculated as the MSE between the teacher model and the student model. Generally, the total loss for knowledge distillation can be calculated as in the following equation.

*Total Loss = Distillation Loss * Distillation Weight + Student Loss * Student Weight*
Total Loss = Distillation Loss \* Distillation Weight + Student Loss \* Student Weight

To train the student FaceID models, no student loss was used, so student weight was set to 0.
To train the student FaceID models, no student loss was used, so student weight was set to 0.

Figure 4 illustrates relation-based knowledge distillation.

<img src="RelationBasedKD.png" style="zoom: 100%;" />

Figure 4. Relation-Based Knowledge Distillation[5]
​Figure 4. Relation-Based Knowledge Distillation[5]



## Dot Product Layer

The dot product layer weights will be populated with the embeddings that are generated by MAX7800x FaceID models. Outputs of the FaceID models are normalized at both inference and recording. Therefore, the result of the dot product layer equals cosine similarity. Using the cosine similarity as a distance metric, the image is identified as either one of the known subjects or 'Unknown' depending on the embedding distances. To record new people in the database, there are two options. The first one is using the Python scripts that are available in the SDK demos. The second option is to use the "record on hardware" mode, which does not require any external connection. The second option is not available for all platforms, so please check the SDK demo READMEs to see if it is supported.



## References

[1] [Cao, Qiong, et al. "Vggface2: A dataset for recognising faces across pose and age." 2018 13th IEEE international conference on automatic face & gesture recognition (FG 2018). IEEE, 2018.](https://arxiv.org/abs/1710.08092)

[2] [A. Wong, M. J. Shafiee, F. Li and B. Chwyl, "Tiny SSD: A Tiny Single-Shot Detection Deep Convolutional Neural Network for Real-Time Embedded Object Detection," 2018 15th Conference on Computer and Robot Vision (CRV), Toronto, ON, Canada, 2018, pp. 95-101, doi: 10.1109/CRV.2018.00023.](https://ieeexplore.ieee.org/document/8575741)
Expand All @@ -92,4 +101,4 @@ From Figure 4, a visual can be seen for the relation-based knowledge distillatio

[4] [Deng, Jiankang, et al. "Arcface: Additive angular margin loss for deep face recognition." Proceedings of the IEEE/CVF conference on computer vision and pattern recognition. 2019.](https://arxiv.org/abs/1801.07698)

[5] [Gou, Jianping, et al. "Knowledge distillation: A survey." International Journal of Computer Vision 129 (2021): 1789-1819.](https://arxiv.org/abs/2006.05525)
[5] [Gou, Jianping, et al. "Knowledge distillation: A survey." International Journal of Computer Vision 129 (2021): 1789-1819.](https://arxiv.org/abs/2006.05525)
2 changes: 1 addition & 1 deletion policies/qat_policy_faceid_112.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -14,4 +14,4 @@ overrides:
feature_stage.4.0.conv2:
weight_bits: 2
linear:
weight_bits: 8
weight_bits: 8

0 comments on commit 9109fd0

Please sign in to comment.