Commit 7dd72f3

7 strip out unused models (#8)

hugs7 authored Aug 23, 2024
2 parents bc81ac0 + 544df18 commit 7dd72f3

Showing 15 changed files with 74 additions and 473 deletions.
30 changes: 0 additions & 30 deletions eye_tracking/constants.py

This file was deleted.

3 changes: 0 additions & 3 deletions eye_tracking/gaze/__main__.py

This file was deleted.

14 changes: 4 additions & 10 deletions eye_tracking/gaze/common/face_model.py
@@ -84,20 +84,14 @@ def compute_3d_pose(self, face: Face) -> None:
         rot = face.head_pose_rot.as_matrix() # Has units of radians
         face.model3d = self.LANDMARKS @ rot.T + face.head_position # This is the 3D model of the face in world coordinates

-    def compute_face_eye_centers(self, face: Face, mode: str) -> None:
+    def compute_face_eye_centers(self, face: Face) -> None:
         """Compute the centers of the face and eyes.
-        In the case of MPIIFaceGaze, the face center is defined as the
+        The face center is defined as the
         average coordinates of the six points at the corners of both
-        eyes and the mouth. In the case of ETH-XGaze, it's defined as
-        the average coordinates of the six points at the corners of both
-        eyes and the nose. The eye centers are defined as the average
-        coordinates of the corners of each eye.
+        eyes and the mouth.
         """
-        if mode == "ETH-XGaze":
-            face.center = face.model3d[np.concatenate([self.REYE_INDICES, self.LEYE_INDICES, self.NOSE_INDICES])].mean(axis=0)
-        else:
-            face.center = face.model3d[np.concatenate([self.REYE_INDICES, self.LEYE_INDICES, self.MOUTH_INDICES])].mean(axis=0)
+        face.center = face.model3d[np.concatenate([self.REYE_INDICES, self.LEYE_INDICES, self.MOUTH_INDICES])].mean(axis=0)

         # Face centre is world coordinates in 3D with units metres relative to the camera
         face.reye.center = face.model3d[self.REYE_INDICES].mean(axis=0)
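Note: the simplified centre computation above boils down to averaging a fixed set of 3D landmarks. A minimal standalone sketch of the idea — the index values below are placeholders, not the project's actual REYE_INDICES/LEYE_INDICES/MOUTH_INDICES constants:

import numpy as np

# Placeholder landmark indices; the real index sets are constants on the
# project's face model class.
REYE_INDICES = np.array([33, 133])
LEYE_INDICES = np.array([362, 263])
MOUTH_INDICES = np.array([61, 291])

def face_center(model3d: np.ndarray) -> np.ndarray:
    """Mean of the eye-corner and mouth-corner points of an (N, 3) model."""
    idx = np.concatenate([REYE_INDICES, LEYE_INDICES, MOUTH_INDICES])
    return model3d[idx].mean(axis=0)

# Example with a dummy 468-landmark model (camera-relative metres).
model3d = np.zeros((468, 3))
center = face_center(model3d)  # np.ndarray of shape (3,)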
62 changes: 27 additions & 35 deletions eye_tracking/gaze/gaze_detector.py
@@ -271,18 +271,17 @@ def _draw_face_template_model(self, face: Face) -> None:
     def _display_normalized_image(self, face: Face) -> None:
         if not self.config.demo.display_on_screen:
             return

         if not self.show_normalized_image:
             return
-        if self.config.mode == "MPIIGaze":
-            reye = face.reye.normalized_image
-            leye = face.leye.normalized_image
-            normalized = np.hstack([reye, leye])
-        elif self.config.mode in ["MPIIFaceGaze", "ETH-XGaze"]:
-            normalized = face.normalized_image
-        else:
-            raise ValueError
+        reye = face.reye.normalized_image
+        leye = face.leye.normalized_image
+        normalized = np.hstack([reye, leye])

         if self.config.demo.use_camera:
             normalized = utils.flip_image(normalized)

         cv2.imshow("normalized", normalized)

     def _draw_gaze_vector(self, face: Face) -> None:
@@ -291,37 +290,30 @@ def _draw_gaze_vector(self, face: Face) -> None:

         length = self.config.demo.gaze_visualization_length

-        if self.config.mode == "MPIIGaze":
-            for key in [FacePartsName.REYE, FacePartsName.LEYE]:
-                eye = getattr(face, key.name.lower())
-                end_point = eye.center + length * eye.gaze_vector # eye.gaze_vector.z is always -1. We scale by length
-                self.visualizer.draw_3d_line(eye.center, end_point)
+        for key in [FacePartsName.REYE, FacePartsName.LEYE]:
+            eye = getattr(face, key.name.lower())
+            end_point = eye.center + length * eye.gaze_vector # eye.gaze_vector.z is always -1. We scale by length
+            self.visualizer.draw_3d_line(eye.center, end_point)

-                pitch, yaw = np.rad2deg(eye.vector_to_angle(eye.gaze_vector))
-                logger.info(f"[{key.name.lower()}] pitch: {pitch:.2f}, yaw: {yaw:.2f}")
+            pitch, yaw = np.rad2deg(eye.vector_to_angle(eye.gaze_vector))
+            logger.info(f"[{key.name.lower()}] pitch: {pitch:.2f}, yaw: {yaw:.2f}")

-            average_eye_distance = (face.reye.distance + face.leye.distance) / 2
-            average_eye_center = (face.reye.center + face.leye.center) / 2
-            average_gaze_vector = (face.reye.gaze_vector + face.leye.gaze_vector) / 2
+        average_eye_distance = (face.reye.distance + face.leye.distance) / 2
+        average_eye_center = (face.reye.center + face.leye.center) / 2
+        average_gaze_vector = (face.reye.gaze_vector + face.leye.gaze_vector) / 2

-            end_point = average_eye_center + length * average_gaze_vector
-            self.visualizer.draw_3d_line(average_eye_center, end_point)
+        end_point = average_eye_center + length * average_gaze_vector
+        self.visualizer.draw_3d_line(average_eye_center, end_point)

-            # Draw the point on the screen the user is looking at
-            point_on_screen = average_eye_center + (average_eye_distance * 0.9) * average_gaze_vector
-            point_on_screen[1] *= 0.15 # Scale y-coordinate
+        # Draw the point on the screen the user is looking at
+        point_on_screen = average_eye_center + (average_eye_distance * 0.9) * average_gaze_vector
+        point_on_screen[1] *= 0.15 # Scale y-coordinate

-            # Update buffer and calculate smoothed point
-            self.point_buffer.append(point_on_screen)
-            if len(self.point_buffer) > self.point_on_screen_smoothing_factor:
-                self.point_buffer.pop(0) # Remove oldest point
+        # Update buffer and calculate smoothed point
+        self.point_buffer.append(point_on_screen)
+        if len(self.point_buffer) > self.point_on_screen_smoothing_factor:
+            self.point_buffer.pop(0) # Remove oldest point

-            smoothed_point = np.mean(self.point_buffer, axis=0)
+        smoothed_point = np.mean(self.point_buffer, axis=0)

-            self.visualizer.draw_3d_points(np.array([smoothed_point]), color=(0, 255, 0), size=10, clamp_to_screen=True)
-        elif self.config.mode in ["MPIIFaceGaze", "ETH-XGaze"]:
-            self.visualizer.draw_3d_line(face.center, face.center + length * face.gaze_vector)
-            pitch, yaw = np.rad2deg(face.vector_to_angle(face.gaze_vector))
-            logger.info(f"[face] pitch: {pitch:.2f}, yaw: {yaw:.2f}")
-        else:
-            raise ValueError
+        self.visualizer.draw_3d_points(np.array([smoothed_point]), color=(0, 255, 0), size=10, clamp_to_screen=True)
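Note: the screen-point smoothing retained above is a plain moving average over the most recent estimates. A minimal standalone sketch of the same idea — the window size of 10 is an assumed default; the real window comes from point_on_screen_smoothing_factor:

import numpy as np

class PointSmoother:
    """Moving average over the last `window` 3D points (sketch only)."""

    def __init__(self, window: int = 10):
        self.window = window
        self.buffer = []

    def update(self, point: np.ndarray) -> np.ndarray:
        self.buffer.append(point)
        if len(self.buffer) > self.window:
            self.buffer.pop(0)  # Drop the oldest point
        return np.mean(self.buffer, axis=0)

smoother = PointSmoother()
smoothed = smoother.update(np.array([0.1, 0.05, 0.9]))  # camera-space metres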
56 changes: 14 additions & 42 deletions eye_tracking/gaze/gaze_estimator.py
@@ -36,7 +36,7 @@ def __init__(self, config: DictConfig):
             self.camera, self._normalized_camera, self._config.gaze_estimator.normalized_camera_distance
         )
         self._gaze_estimation_model = self._load_model()
-        self._transform = create_transform(config)
+        self._transform = create_transform()

     def _load_model(self) -> torch.nn.Module:
         model = create_model(self._config)
@@ -55,36 +55,32 @@ def detect_faces_raw(self, image: np.ndarray) -> List[np.ndarray]:
     def estimate_gaze(self, image: np.ndarray, face: Face) -> None:
         self._face_model3d.estimate_head_pose(face, self.camera)
         self._face_model3d.compute_3d_pose(face)
-        self._face_model3d.compute_face_eye_centers(face, self._config.mode)
-
-        if self._config.mode == "MPIIGaze":
-            for key in self.EYE_KEYS:
-                eye = getattr(face, key.name.lower())
-                self._head_pose_normalizer.normalize(image, eye)
-            self._run_mpiigaze_model(face)
-        elif self._config.mode == "MPIIFaceGaze":
-            self._head_pose_normalizer.normalize(image, face)
-            self._run_mpiifacegaze_model(face)
-        elif self._config.mode == "ETH-XGaze":
-            self._head_pose_normalizer.normalize(image, face)
-            self._run_ethxgaze_model(face)
-        else:
-            raise ValueError
+        self._face_model3d.compute_face_eye_centers(face)
+
+        for key in self.EYE_KEYS:
+            eye = getattr(face, key.name.lower())
+            self._head_pose_normalizer.normalize(image, eye)
+
+        self._run_mpiigaze_model(face)

     @torch.no_grad()
     def _run_mpiigaze_model(self, face: Face) -> None:
         images = []
         head_poses = []

         for key in self.EYE_KEYS:
             eye = getattr(face, key.name.lower())
             image = eye.normalized_image
             normalized_head_pose = eye.normalized_head_rot2d

             if key == FacePartsName.REYE:
                 image = utils.flip_image(image).copy()
                 normalized_head_pose *= np.array([1, -1])

             image = self._transform(image)
             images.append(image)
             head_poses.append(normalized_head_pose)

         images = torch.stack(images)
         head_poses = np.array(head_poses).astype(np.float32)
         head_poses = torch.from_numpy(head_poses)
@@ -98,33 +94,9 @@ def _run_mpiigaze_model(self, face: Face) -> None:
         for i, key in enumerate(self.EYE_KEYS):
             eye = getattr(face, key.name.lower())
             eye.normalized_gaze_angles = predictions[i]

             if key == FacePartsName.REYE:
                 eye.normalized_gaze_angles *= np.array([1, -1])

             eye.angle_to_vector()
             eye.denormalize_gaze_vector()
-
-    @torch.no_grad()
-    def _run_mpiifacegaze_model(self, face: Face) -> None:
-        image = self._transform(face.normalized_image).unsqueeze(0)
-
-        device = torch.device(self._config.device)
-        image = image.to(device)
-        prediction = self._gaze_estimation_model(image)
-        prediction = prediction.cpu().numpy()
-
-        face.normalized_gaze_angles = prediction[0]
-        face.angle_to_vector()
-        face.denormalize_gaze_vector()
-
-    @torch.no_grad()
-    def _run_ethxgaze_model(self, face: Face) -> None:
-        image = self._transform(face.normalized_image).unsqueeze(0)
-
-        device = torch.device(self._config.device)
-        image = image.to(device)
-        prediction = self._gaze_estimation_model(image)
-        prediction = prediction.cpu().numpy()
-
-        face.normalized_gaze_angles = prediction[0]
-        face.angle_to_vector()
-        face.denormalize_gaze_vector()
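Note: one subtlety worth calling out in _run_mpiigaze_model above: the right-eye patch is mirrored so a single model can serve both eyes, and the yaw component is negated on the way in (normalized head pose) and again on the way out (predicted gaze angles). A rough standalone illustration of that symmetry — the function names here are hypothetical, not the project's API:

import numpy as np

def prepare_right_eye(image: np.ndarray, head_pose: np.ndarray):
    """Mirror the right eye into the left-eye frame before inference."""
    flipped = image[:, ::-1].copy()                # horizontal flip
    return flipped, head_pose * np.array([1, -1])  # negate yaw

def restore_right_eye(gaze_angles: np.ndarray) -> np.ndarray:
    """Map predicted (pitch, yaw) back to the unmirrored right eye."""
    return gaze_angles * np.array([1, -1])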
90 changes: 18 additions & 72 deletions eye_tracking/gaze/head_pose_estimation/face_landmark_estimator.py
@@ -1,7 +1,5 @@
 from typing import List

-import face_alignment
-import face_alignment.detection.sfd
 import mediapipe
 import numpy as np
 from omegaconf import DictConfig
@@ -12,81 +10,21 @@
 class LandmarkEstimator:
     def __init__(self, config: DictConfig):
         self.mode = config.face_detector.mode
-        if self.mode == "dlib":
-            raise NotImplementedError("Dlib is not supported for landmark estimation")
-        elif self.mode == "face_alignment_dlib":
-            raise NotImplementedError("Dlib is not supported for landmark estimation")
-        elif self.mode == "face_alignment_sfd":
-            self.detector = face_alignment.detection.sfd.sfd_detector.SFDDetector(device=config.device)
-            self.predictor = face_alignment.FaceAlignment(face_alignment.LandmarksType._2D, flip_input=False, device=config.device)
-        elif self.mode == "mediapipe":
-            self.detector = mediapipe.solutions.face_mesh.FaceMesh(
-                max_num_faces=config.face_detector.mediapipe_max_num_faces,
-                static_image_mode=config.face_detector.mediapipe_static_image_mode,
-                refine_landmarks=True, # Adds eye pupil landmarks (468-477)
-            )
-        else:
-            raise ValueError
+        self.detector = mediapipe.solutions.face_mesh.FaceMesh(
+            max_num_faces=config.face_detector.mediapipe_max_num_faces,
+            static_image_mode=config.face_detector.mediapipe_static_image_mode,
+            refine_landmarks=True, # Adds eye pupil landmarks (468-477)
+        )

     def detect_faces(self, image: np.ndarray) -> List[Face]:
         if self.mode == "dlib":
             return self._detect_faces_dlib(image)
         elif self.mode == "face_alignment_dlib":
             return self._detect_faces_face_alignment_dlib(image)
         elif self.mode == "face_alignment_sfd":
             return self._detect_faces_face_alignment_sfd(image)
         elif self.mode == "mediapipe":
             return self._detect_faces_mediapipe(image)
         else:
             raise ValueError

-    def detect_faces_raw(self, image: np.ndarray) -> List[np.ndarray]:
-        if self.mode == "mediapipe":
-            return self._detect_faces_mediapipe_raw(image)
-        else:
-            raise ValueError
-
-    def _detect_faces_dlib(self, image: np.ndarray) -> List[Face]:
-        bboxes = self.detector(self._get_bgr_frame(image), 0)
-        detected = []
-        for bbox in bboxes:
-            predictions = self.predictor(self._get_bgr_frame(image), bbox)
-            landmarks = np.array([(pt.x, pt.y) for pt in predictions.parts()], dtype=np.float32)
-            bbox = np.array([[bbox.left(), bbox.top()], [bbox.right(), bbox.bottom()]], dtype=np.float32)
-            detected.append(Face(bbox, landmarks))
-        return detected
-
-    def _detect_faces_face_alignment_dlib(self, image: np.ndarray) -> List[Face]:
-        bboxes = self.detector(self._get_bgr_frame(image), 0)
-        bboxes = [[bbox.left(), bbox.top(), bbox.right(), bbox.bottom()] for bbox in bboxes]
-        predictions = self.predictor.get_landmarks(self._get_bgr_frame(image), detected_faces=bboxes)
-        if predictions is None:
-            predictions = []
-        detected = []
-        for bbox, landmarks in zip(bboxes, predictions):
-            bbox = np.array(bbox, dtype=np.float32).reshape(2, 2)
-            detected.append(Face(bbox, landmarks))
-        return detected
-
-    def _detect_faces_face_alignment_sfd(self, image: np.ndarray) -> List[Face]:
-        bboxes = self.detector.detect_from_image(self._get_bgr_frame(image).copy())
-        bboxes = [bbox[:4] for bbox in bboxes]
-        predictions = self.predictor.get_landmarks(self._get_bgr_frame(image), detected_faces=bboxes)
-        if predictions is None:
-            predictions = []
-        detected = []
-        for bbox, landmarks in zip(bboxes, predictions):
-            bbox = np.array(bbox, dtype=np.float32).reshape(2, 2)
-            detected.append(Face(bbox, landmarks))
-        return detected

     def _detect_faces_mediapipe(self, image: np.ndarray) -> List[Face]:
         """
         Calculated landmarks scaled to the image size with a bounding box
         :param image: RGB image
         :return: List of faces
         """
         h, w = image.shape[:2]
-        faces_landmarks = self._detect_faces_mediapipe_raw(image)
+        faces_landmarks = self._detect_faces_raw(image)
         detected = []
         if faces_landmarks:
             for face in faces_landmarks:
@@ -96,9 +34,17 @@ def _detect_faces_mediapipe(self, image: np.ndarray) -> List[Face]:
             detected.append(Face(bbox, pts))
         return detected

-    def _detect_faces_mediapipe_raw(self, image: np.ndarray) -> List[np.ndarray]:
+    def detect_faces_raw(self, image: np.ndarray) -> List[np.ndarray]:
+        if self.mode == "mediapipe":
+            return self._detect_faces_raw(image)
+        else:
+            raise ValueError
+
+    def _detect_faces_raw(self, image: np.ndarray) -> List[np.ndarray]:
         """
-        Returns landmarks as they come from the mediapipe model
+        Returns landmarks as they come from the mediapipe model (not scaled to the image size)
         :param image: RGB image
         :return: List of faces landmarks
         """
         predictions = self.detector.process(self._get_bgr_frame(image))
         faces_landmarks = []
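Note: for readers unfamiliar with the mediapipe path this commit standardises on: FaceMesh returns landmarks normalised to [0, 1], which is why _detect_faces_mediapipe scales them by the image width and height. A self-contained sketch of that flow, using only documented mediapipe calls — "face.jpg" is a placeholder input, and passing RGB to process() follows mediapipe's convention:

import cv2
import mediapipe as mp
import numpy as np

face_mesh = mp.solutions.face_mesh.FaceMesh(
    max_num_faces=1,
    static_image_mode=True,
    refine_landmarks=True,  # Adds eye pupil landmarks (468-477)
)

image_bgr = cv2.imread("face.jpg")  # placeholder path
h, w = image_bgr.shape[:2]
results = face_mesh.process(cv2.cvtColor(image_bgr, cv2.COLOR_BGR2RGB))

for face in results.multi_face_landmarks or []:
    # Scale normalised landmarks to pixel coordinates.
    pts = np.array([(lm.x * w, lm.y * h) for lm in face.landmark], dtype=np.float32)
    bbox = np.stack([pts.min(axis=0), pts.max(axis=0)])  # [[x0, y0], [x1, y1]]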
