From 9109fd0f075e3b1667188c4637ec6d2759aac2d5 Mon Sep 17 00:00:00 2001
From: Oguzhan Buyuksolak
Date: Thu, 7 Dec 2023 00:32:29 +0300
Subject: [PATCH] FaceID Part 2

---
 datasets/vggface2.py                | 52 +++++++++++++++++------------
 docs/FacialRecognitionSystem.md     | 39 +++++++++++++---------
 policies/qat_policy_faceid_112.yaml |  2 +-
 3 files changed, 55 insertions(+), 38 deletions(-)

diff --git a/datasets/vggface2.py b/datasets/vggface2.py
index b69da585fd..a8b0ae8ef2 100644
--- a/datasets/vggface2.py
+++ b/datasets/vggface2.py
@@ -13,36 +13,40 @@
 import os
 import glob
-import torch
-import numpy as np
 import errno
+import pickle
+
+import numpy as np
 import cv2
 from tqdm import tqdm
 from PIL import Image
-import pickle
-
+import torch
+from torch.utils.data import Dataset
 from torchvision import transforms
 import torchvision.transforms.functional as FT
-import ai8x
-from torch.utils.data import Dataset
+
 from utils import augmentation_utils
 from skimage import transform as trans
 from hawk_eyes.face import RetinaFace
 import kornia.geometry.transform as GT
+import ai8x
+

 class VGGFace2(Dataset):
     """
     VGGFace2 Dataset
     """
-    def __init__(self, root_dir, d_type, mode, transform=None, teacher_transform=None, img_size=(112, 112), args=None):
+    def __init__(self, root_dir, d_type, mode, transform=None,
+                 teacher_transform=None, img_size=(112, 112)):

         if d_type not in ('test', 'train'):
             raise ValueError("d_type can only be set to 'test' or 'train'")

         if mode not in ('detection', 'identification', 'identification_dr'):
-            raise ValueError("mode can only be set to 'detection', 'identification', or 'identification_dr'")
+            raise ValueError("mode can only be set to 'detection', 'identification', "
+                             "or 'identification_dr'")

         self.root_dir = root_dir
         self.d_type = d_type
@@ -65,7 +69,8 @@ def __init__(self, root_dir, d_type, mode, transform=None, teacher_transform=Non
         self.__makedir_exist_ok(os.path.join(self.dataset_path, "processed"))

         if self.d_type in ('train', 'test'):
-            self.gt_path = os.path.join(self.dataset_path, "processed", self.d_type+"_vggface2.pickle")
+            self.gt_path = os.path.join(self.dataset_path, "processed",
+                                        self.d_type+"_vggface2.pickle")
             self.d_path = os.path.join(self.dataset_path, self.d_type)
             if not os.path.exists(self.gt_path):
                 assert os.path.isdir(self.d_path), (f'No dataset at {self.d_path}.\n'
@@ -81,7 +86,7 @@ def __init__(self, root_dir, d_type, mode, transform=None, teacher_transform=Non
                     ' - test \n')

                 print("Extracting ground truth from the " + self.d_type + " set")
-                self.__extract_gt(args)
+                self.__extract_gt()

         f = open(self.gt_path, 'rb')
         self.pickle_dict = pickle.load(f)
@@ -92,7 +97,7 @@ def __init__(self, root_dir, d_type, mode, transform=None, teacher_transform=Non

         return

-    def __extract_gt(self, args):
+    def __extract_gt(self):
         """
         Extracts the ground truth from the dataset
         """
@@ -146,10 +151,10 @@ def __getitem__(self, index):
         if self.mode == 'detection':
             return self.__getitem_detection(index)

-        elif self.mode == 'identification':
+        if self.mode == 'identification':
             return self.__getitem_identification(index)

-        elif self.mode == 'identification_dr':
+        if self.mode == 'identification_dr':
             return self.__getitem_identification_dr(index)
@@ -164,7 +169,8 @@ def __getitem_detection(self, index):
         boxes = self.pickle_dict["boxes"][index]
         boxes = torch.as_tensor(boxes, dtype=torch.float32)

-        img, boxes = augmentation_utils.resize(img, boxes, dims=(self.img_size[0], self.img_size[1]))
+        img, boxes = augmentation_utils.resize(img, boxes,
+                                               dims=(self.img_size[0], self.img_size[1]))
         labels = [1] * boxes.shape[0]
@@ -217,7 +223,8 @@ def __getitem_identification(self, index):
             box[3] = np.clip(box[3] + (max_dim - height) / 2, 0, img.shape[1])

         #Crop image with the square bounding box
-        img_C = FT.crop(img= img, top=int(box[1]), left=int(box[0]), height=int(box[3]-box[1]), width=int(box[2]-box[0]))
+        img_C = FT.crop(img=img, top=int(box[1]), left=int(box[0]),
+                        height=int(box[3]-box[1]), width=int(box[2]-box[0]))

         #Check if the cropped image is square, if not, pad it
         _, h, w = img_C.shape
@@ -308,7 +315,8 @@ def VGGFace2_FaceID_get_datasets(data, load_train=True, load_test=True, img_size
         train_transform = transforms.Compose([
             transforms.ToTensor(),
             transforms.RandomHorizontalFlip(p=0.5),
-            transforms.ColorJitter(brightness=(0.6,1.4),saturation=(0.6,1.4),contrast=(0.6,1.4),hue=(-0.4,0.4)),
+            transforms.ColorJitter(brightness=(0.6, 1.4), saturation=(0.6, 1.4),
+                                   contrast=(0.6, 1.4), hue=(-0.4, 0.4)),
             transforms.RandomErasing(p=0.1),
             ai8x.normalize(args=args)
         ])
@@ -321,7 +329,7 @@ def VGGFace2_FaceID_get_datasets(data, load_train=True, load_test=True, img_size
         train_dataset = VGGFace2(root_dir=data_dir, d_type='train', mode='identification',
                                  transform=train_transform, teacher_transform=teacher_transform,
-                                 img_size=img_size, args=args)
+                                 img_size=img_size)

         print(f'Train dataset length: {len(train_dataset)}\n')
     else:
@@ -333,7 +341,7 @@ def VGGFace2_FaceID_get_datasets(data, load_train=True, load_test=True, img_size
         test_dataset = VGGFace2(root_dir=data_dir, d_type='test', mode='identification',
                                 transform=test_transform, teacher_transform=teacher_transform,
-                                img_size=img_size, args=args)
+                                img_size=img_size)

         print(f'Test dataset length: {len(test_dataset)}\n')
     else:
@@ -356,7 +364,7 @@ def VGGFace2_FaceID_dr_get_datasets(data, load_train=True, load_test=True, img_s

     if load_train:
         train_dataset = VGGFace2(root_dir=data_dir, d_type='train', mode='identification_dr',
-                                 transform=train_transform, img_size=img_size, args=args)
+                                 transform=train_transform, img_size=img_size)

         print(f'Train dataset length: {len(train_dataset)}\n')
     else:
@@ -367,7 +375,7 @@ def VGGFace2_FaceID_dr_get_datasets(data, load_train=True, load_test=True, img_s
                                              ai8x.normalize(args=args)])

         test_dataset = VGGFace2(root_dir=data_dir, d_type='test', mode='identification_dr',
-                                transform=test_transform, img_size=img_size, args=args)
+                                transform=test_transform, img_size=img_size)

         print(f'Test dataset length: {len(test_dataset)}\n')
     else:
@@ -387,7 +395,7 @@ def VGGFace2_Facedet_get_datasets(data, load_train=True, load_test=True, img_siz
         ])

         train_dataset = VGGFace2(root_dir=data_dir, d_type='train', mode='detection',
-                                 transform=train_transform, img_size=img_size, args=args)
+                                 transform=train_transform, img_size=img_size)

         print(f'Train dataset length: {len(train_dataset)}\n')
     else:
@@ -397,7 +405,7 @@ def VGGFace2_Facedet_get_datasets(data, load_train=True, load_test=True, img_siz
         test_transform = transforms.Compose([ai8x.normalize(args=args)])

         test_dataset = VGGFace2(root_dir=data_dir, d_type='test', mode='detection',
-                                transform=test_transform, img_size=img_size, args=args)
+                                transform=test_transform, img_size=img_size)

         print(f'Test dataset length: {len(test_dataset)}\n')
     else:
diff --git a/docs/FacialRecognitionSystem.md b/docs/FacialRecognitionSystem.md
index f1182ed647..66ed5095e9 100644
--- a/docs/FacialRecognitionSystem.md
+++ b/docs/FacialRecognitionSystem.md
@@ -1,4 +1,5 @@
 # Facial Recognition System
+
 This document explains the facial recognition applications for the MAX7800x series of microcontrollers.

 The facial recognition task consists of three main parts: face detection, face identification, and a dot product.
 - The face detection model detects faces in the captured image and extracts a rectangular sub-image containing only one face.
@@ -9,48 +10,53 @@

 Figure 1 depicts the sequential diagram of the facial recognition system.

-​ Figure 1. MAX7800x facial recognition system
+​Figure 1. MAX7800x facial recognition system

 ## Dataset
-The first step will be the dataset preparation. The dataset is VGGFace-2 [1].
-Please review the term and conditions at https://www.robots.ox.ac.uk/~vgg/data/vgg_face2/. Then, download the dataset and extract raw images to the train and test subfolders.
+
+The first step is dataset preparation. The dataset is VGGFace-2 [1].
+Please review the terms and conditions at [VGGFace2](https://www.robots.ox.ac.uk/~vgg/data/vgg_face2/). Then, download the dataset and extract the raw images to the train and test subfolders.

 Expected folder structure:

   - root_dir
-    - VGGFace-2
-      - train
+    - VGGFace-2
+      - train
       - test

 The FaceID and Face Detection tasks share the same ground truth pickle, which is generated automatically when either of these tasks is first run.
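+
+As a quick sanity check of the prepared dataset, the loaders can also be driven directly from Python. The following is a minimal sketch, not part of the training flow: it assumes the repository's dependencies are installed, that `'data'` is the root_dir from the folder structure above, and that a stand-in `args` namespace is sufficient (after this patch, only `ai8x.normalize()` still consumes `args`; the attributes your ai8x-training version expects may differ). Note that the first run extracts the ground truth pickle, which takes a while.
+
+```python
+from types import SimpleNamespace
+
+import ai8x
+from datasets.vggface2 import VGGFace2_FaceID_get_datasets
+
+ai8x.set_device(85, simulate=False, round_avg=False)  # 85 = MAX78000
+args = SimpleNamespace(act_mode_8bit=False)  # hypothetical stand-in for parsed training args
+
+train_set, test_set = VGGFace2_FaceID_get_datasets(
+    ('data', args), load_train=True, load_test=True, img_size=(112, 112))
+print(f'train: {len(train_set)}, test: {len(test_set)}')
+```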
 ## Face Detection
+
 To localize faces, facial recognition systems generally employ a face detection algorithm. Face detection is an object detection problem with various solutions in the literature. In this work, a face detection algorithm that runs on MAX7800x series microcontrollers with real-time performance was targeted.

 For the digit detection problem, a Tiny SSD [2] based MAX7800x object detection algorithm, named Tinier SSD, was previously developed. The face detection model is a modified version of the digit detection model. The modification reduces the number of parameters and enables a larger input size.

-To train the facedetection model, "scripts/train_facedet_tinierssd.sh" script can be used. 
+To train the face detection model, the "scripts/train_facedet_tinierssd.sh" script can be used.

 ## FaceID
+
 Training a FaceID model for MAX7800x microcontrollers takes multiple steps. As the MAX7800x FaceID models are trained in a knowledge distillation fashion, the first step is downloading a backbone checkpoint for the teacher model.
 The suggested teacher model is IR-152, but the other teacher models defined in "model_irse_DRL.py" may be used as well. Please review the terms and conditions at the face.evoLVe [3] repository, and download the checkpoint according to your teacher model selection.

-There are two FaceID models, one for the MAX78000 and one for the MAX78002. The MAX78000 one is named faceid_112, and it is a relatively lightweight model. To enable more performance on MAX78002, a more complex model was developed, which is named mobilefacenet_112. To train the FaceID models, "scripts/train_faceid_112.sh" and "scripts/train_mobilefacenet_112.sh" scripts can be used. Training scripts will realize Dimensionality Reduction and Relation Based-Knowledge Knowledge Distillation steps automatically. A summary of Dimensionality Reduction and Relation-Based Knowledge Distillation can be found in the following sub-sections.
+There are two FaceID models, one for the MAX78000 and one for the MAX78002.
+The MAX78000 model is named faceid_112 and is relatively lightweight. For higher performance on the MAX78002, a more complex model named mobilefacenet_112 was developed. To train the FaceID models, the "scripts/train_faceid_112.sh" and "scripts/train_mobilefacenet_112.sh" scripts can be used. The training scripts carry out the Dimensionality Reduction and Relation-Based Knowledge Distillation steps automatically; both are summarized in the following sub-sections.

 ### Dimensionality Reduction on the Teacher Model
+
 Reducing the embedding dimensionality can greatly reduce the post-processing operations and memory usage of the facial recognition system. To achieve this, the teacher backbone is frozen and two additional Conv1d layers, called dimension reduction layers, are added to the teacher models. The suggested embedding dimensionality is 64, but other choices such as 32, 128, and 256 can be tried. A summary of the dimensionality reduction is shown in Figure 2, and the dimension reduction layers are detailed in Table 1.

-​ Figure 2. Dimensionality Reduction
+​Figure 2. Dimensionality Reduction

 Table 1. Dimension Reduction Layers

 | Layer 1                               | Layer 2                              |
 |---------------------------------------|--------------------------------------|
 | Conv1d(In=512ch, Out=512ch, Kernel=1) | Conv1d(In=512ch, Out=64ch, Kernel=1) |
 | BatchNorm1d(512)                      | BatchNorm1d(64)                      |
 | PReLU(512)                            |                                      |

@@ -61,29 +67,32 @@ To train dimensionality reduction layers Sub-Center ArcFace loss is used. The Su

-​ Figure 3. Sub-Center ArcFace Loss[4]
+​Figure 3. Sub-Center ArcFace Loss[4]
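+
+The dimension reduction layers in Table 1 can be written out in PyTorch as follows. This is a minimal illustrative sketch of the Table 1 structure, not the exact module from "model_irse_DRL.py":
+
+```python
+import torch
+from torch import nn
+
+class DimensionReduction(nn.Module):
+    """Maps frozen 512-dimensional teacher embeddings down to 64 dimensions (Table 1)."""
+    def __init__(self, in_dim=512, out_dim=64):
+        super().__init__()
+        self.layer1 = nn.Sequential(nn.Conv1d(in_dim, in_dim, kernel_size=1),
+                                    nn.BatchNorm1d(in_dim), nn.PReLU(in_dim))
+        self.layer2 = nn.Sequential(nn.Conv1d(in_dim, out_dim, kernel_size=1),
+                                    nn.BatchNorm1d(out_dim))
+
+    def forward(self, x):    # x: (batch, 512) teacher embeddings
+        x = x.unsqueeze(-1)  # Conv1d expects (batch, channels, length)
+        return self.layer2(self.layer1(x)).squeeze(-1)
+
+reduced = DimensionReduction()(torch.randn(8, 512))
+print(reduced.shape)  # torch.Size([8, 64])
+```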

 ### Relation-Based Knowledge Distillation
+
 The knowledge distillation choice for the FaceID models was a relation-based one. The distillation loss is calculated as the MSE between the relations derived from the teacher and the student models. In general, the total loss for knowledge distillation can be calculated as in the following equation.

-*Total Loss = Distillation Loss * Distillation Weight + Student Loss * Student Weight*
+Total Loss = Distillation Loss * Distillation Weight + Student Loss * Student Weight

-To train the student FaceID models, no student loss was used, so student weight was set to 0. 
+To train the student FaceID models, no student loss was used, so the student weight was set to 0.
 Figure 4 illustrates the relation-based knowledge distillation.

-​ Figure 4. Relation-Based Knowledge Distillation[5]
+​Figure 4. Relation-Based Knowledge Distillation[5]
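+
+A minimal sketch of this loss computation. The pairwise-cosine "relation" used here is one common relation-based formulation, shown for illustration only; the repository's training scripts implement their own version, and the weights below are placeholders:
+
+```python
+import torch
+import torch.nn.functional as F
+
+def relations(embeddings):
+    """Pairwise cosine similarities within a batch: one common notion of 'relations'."""
+    z = F.normalize(embeddings, dim=1)
+    return z @ z.t()
+
+def total_loss(student_emb, teacher_emb, student_loss=0.0,
+               distill_weight=1.0, student_weight=0.0):
+    # Total Loss = Distillation Loss * Distillation Weight + Student Loss * Student Weight
+    distill_loss = F.mse_loss(relations(student_emb), relations(teacher_emb))
+    return distill_weight * distill_loss + student_weight * student_loss
+
+loss = total_loss(torch.randn(16, 64, requires_grad=True), torch.randn(16, 64))
+loss.backward()  # only the distillation term contributes, since student_weight == 0
+```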

 ## Dot Product Layer
+
 The dot product layer weights are populated with the embeddings generated by the MAX7800x FaceID models. The outputs of the FaceID models are normalized at both inference and recording; since both vectors then have unit length, the result of the dot product layer equals the cosine similarity. Using the cosine similarity as a distance metric, the image is identified as either one of the known subjects or 'Unknown', depending on the embedding distances.

 To record new people in the database, there are two options. The first is to use the Python scripts that are available in the SDK demos. The second is the "record on hardware" mode, which does not require any external connection. The second option is not available on all platforms, so please check the SDK demo READMEs to see whether it is supported.

 ## References
+
 [1] [Cao, Qiong, et al. "VGGFace2: A dataset for recognising faces across pose and age." 2018 13th IEEE International Conference on Automatic Face & Gesture Recognition (FG 2018). IEEE, 2018.](https://arxiv.org/abs/1710.08092)

 [2] [A. Wong, M. J. Shafiee, F. Li and B. Chwyl, "Tiny SSD: A Tiny Single-Shot Detection Deep Convolutional Neural Network for Real-Time Embedded Object Detection," 2018 15th Conference on Computer and Robot Vision (CRV), Toronto, ON, Canada, 2018, pp. 95-101, doi: 10.1109/CRV.2018.00023.](https://ieeexplore.ieee.org/document/8575741)

@@ -92,4 +101,4 @@ From Figure 4, a visual can be seen for the relation-based knowledge distillatio

 [4] [Deng, Jiankang, et al. "ArcFace: Additive angular margin loss for deep face recognition." Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition. 2019.](https://arxiv.org/abs/1801.07698)

-[5] [Gou, Jianping, et al. "Knowledge distillation: A survey." International Journal of Computer Vision 129 (2021): 1789-1819.](https://arxiv.org/abs/2006.05525)
\ No newline at end of file
+[5] [Gou, Jianping, et al. "Knowledge distillation: A survey." International Journal of Computer Vision 129 (2021): 1789-1819.](https://arxiv.org/abs/2006.05525)
diff --git a/policies/qat_policy_faceid_112.yaml b/policies/qat_policy_faceid_112.yaml
index 197471912d..a40fdeaea5 100644
--- a/policies/qat_policy_faceid_112.yaml
+++ b/policies/qat_policy_faceid_112.yaml
@@ -14,4 +14,4 @@ overrides:
   feature_stage.4.0.conv2:
     weight_bits: 2
   linear:
-    weight_bits: 8
\ No newline at end of file
+    weight_bits: 8
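
The QAT policy file touched above assigns per-layer weight precision for quantization-aware training: its `overrides` section pins `feature_stage.4.0.conv2` to 2-bit weights and the final `linear` layer to 8-bit weights (further entries may exist outside the hunk shown). As a minimal sketch, assuming only PyYAML and the repository layout, the overrides can be inspected like this; the training flow itself consumes the file through its QAT policy option:

```python
import yaml

# Print the per-layer weight-bit overrides from the FaceID QAT policy.
with open('policies/qat_policy_faceid_112.yaml', encoding='utf-8') as f:
    policy = yaml.safe_load(f)

for layer, override in policy.get('overrides', {}).items():
    print(f"{layer}: {override.get('weight_bits')}-bit weights")
```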