Summary

aimclub · Oct 10, 2024 · de0c642 · de0c642
1 parent 25c445d
commit de0c642
Show file tree

Hide file tree

Showing 7 changed files with 10,320 additions and 5,519 deletions.
diff --git a/docs/source/user_guide/notebooks/Audio-get_acoustic_features.ipynb b/docs/source/user_guide/notebooks/Audio-get_acoustic_features.ipynb
@@ -60,7 +60,7 @@
     {
      "data": {
       "text/markdown": [
-       "<span style=\"color:#333\">**[</span><span style=\"color:#1776D2\">2024-10-08 19:49:08</span><span style=\"color:#333\">]</span> <span style=\"color:#333\">OCEANAI - персональные качества личности человека:</span>**<br /><span style=\"color:#333\">&nbsp;&nbsp;&nbsp;&nbsp;Авторы:</span><br /><span style=\"color:#333\">&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;Рюмина Елена [<u>[email protected]</u>]</span><br /><span style=\"color:#333\">&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;Рюмин Дмитрий [<u>[email protected]</u>]</span><br /><span style=\"color:#333\">&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;Карпов Алексей [<u>[email protected]</u>]</span><br /><span style=\"color:#333\">&nbsp;&nbsp;&nbsp;&nbsp;Сопровождающие:</span><br /><span style=\"color:#333\">&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;Рюмина Елена [<u>[email protected]</u>]</span><br /><span style=\"color:#333\">&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;Рюмин Дмитрий [<u>[email protected]</u>]</span><br /><span style=\"color:#333\">&nbsp;&nbsp;&nbsp;&nbsp;Версия: <u>1.0.0a40</u></span><br /><span style=\"color:#333\">&nbsp;&nbsp;&nbsp;&nbsp;Лицензия: <u>BSD License</u></span></p>"
+       "<span style=\"color:#333\">**[</span><span style=\"color:#1776D2\">2024-10-09 16:38:10</span><span style=\"color:#333\">]</span> <span style=\"color:#333\">OCEANAI - персональные качества личности человека:</span>**<br /><span style=\"color:#333\">&nbsp;&nbsp;&nbsp;&nbsp;Авторы:</span><br /><span style=\"color:#333\">&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;Рюмина Елена [<u>[email protected]</u>]</span><br /><span style=\"color:#333\">&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;Рюмин Дмитрий [<u>[email protected]</u>]</span><br /><span style=\"color:#333\">&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;Карпов Алексей [<u>[email protected]</u>]</span><br /><span style=\"color:#333\">&nbsp;&nbsp;&nbsp;&nbsp;Сопровождающие:</span><br /><span style=\"color:#333\">&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;Рюмина Елена [<u>[email protected]</u>]</span><br /><span style=\"color:#333\">&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;Рюмин Дмитрий [<u>[email protected]</u>]</span><br /><span style=\"color:#333\">&nbsp;&nbsp;&nbsp;&nbsp;Версия: <u>1.0.0a40</u></span><br /><span style=\"color:#333\">&nbsp;&nbsp;&nbsp;&nbsp;Лицензия: <u>BSD License</u></span></p>"
       ],
       "text/plain": [
        "<IPython.core.display.Markdown object>"
@@ -167,7 +167,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.9.13"
+   "version": "3.10.11"
   }
  },
  "nbformat": 4,

diff --git a/docs/source/user_guide/notebooks/Pipeline_practical_task_1.ipynb b/docs/source/user_guide/notebooks/Pipeline_practical_task_1.ipynb
diff --git a/docs/source/user_guide/notebooks/Pipeline_practical_task_2.ipynb b/docs/source/user_guide/notebooks/Pipeline_practical_task_2.ipynb
diff --git a/docs/source/user_guide/notebooks/Pipeline_practical_task_3.ipynb b/docs/source/user_guide/notebooks/Pipeline_practical_task_3.ipynb
diff --git a/docs/source/user_guide/notebooks/Prediction-get_av_union_predictions.ipynb b/docs/source/user_guide/notebooks/Prediction-get_av_union_predictions.ipynb
diff --git a/oceanai/modules/core/core.py b/oceanai/modules/core/core.py
@@ -4161,7 +4161,6 @@ def _professional_match(
             df_files (pd.DataFrame): **DataFrame** c данными
             correlation_coefficients (pd.DataFrame): **DataFrame** c коэффициентами корреляции
             personality_type (str): Персональный тип по версии MBTI
-            col_name_ocean (str): Столбец с названиями персональных качеств личности человека
             threshold (float): Порог для оценок полярности качеств (например, интроверт < 0.55, экстраверт > 0.55)
             out (bool): Отображение
 
@@ -4204,7 +4203,10 @@ def _professional_match(
 
                     name_mbti = correlation_coefficients.columns[1:]
 
-                    need_type = self.dict_mbti[personality_type]
+                    if len(personality_type) != 4:
+                        need_type = self.dict_mbti[personality_type]
+                    else:
+                        need_type = personality_type
 
                     for path in range(len(self._df_files)):
                         curr_traits = self._df_files.iloc[path].values[1:]
@@ -4233,9 +4235,9 @@ def _professional_match(
                         by=["MBTI_Score"], ascending=False
                     )
 
-                    self._df_files_MBTI_job_match.index.name = self._keys_id
-                    self._df_files_MBTI_job_match.index += 1
-                    self._df_files_MBTI_job_match.index = self._df_files_MBTI_job_match.index.map(str)
+                    # self._df_files_MBTI_job_match.index.name = self._keys_id
+                    # self._df_files_MBTI_job_match.index += 1
+                    # self._df_files_MBTI_job_match.index = self._df_files_MBTI_job_match.index.map(str)
 
                 except Exception:
                     self._other_error(self._unknown_err, out=out)
@@ -4261,7 +4263,6 @@ def _colleague_personality_type_match(
             df_files (pd.DataFrame): **DataFrame** c данными
             correlation_coefficients (pd.DataFrame): **DataFrame** c коэффициентами корреляции
             target_scores (List[float]): Список оценок персональных качеств личности целевого человека
-            col_name_ocean (str): Столбец с названиями персональных качеств личности человека
             threshold (float): Порог для оценок полярности качеств (например, интроверт < 0.55, экстраверт > 0.55)
             out (bool): Отображение
 
@@ -4340,7 +4341,7 @@ def _colleague_personality_type_match(
                             ]
                         )
 
-                        match, _ = self._compatibility_percentage(target_personality_type, personality_type)
+                        match, _ = self._compatibility_percentage(target_personality_type, personality_type, curr_weights)
 
                         self._df_files_MBTI_colleague_match.loc[
                             str(path + 1),
@@ -4351,9 +4352,9 @@ def _colleague_personality_type_match(
                         by=["Match"], ascending=False
                     )
 
-                    self._df_files_MBTI_colleague_match.index.name = self._keys_id
-                    self._df_files_MBTI_colleague_match.index += 1
-                    self._df_files_MBTI_colleague_match.index = self._df_files_MBTI_colleague_match.index.map(str)
+                    # self._df_files_MBTI_colleague_match.index.name = self._keys_id
+                    # self._df_files_MBTI_colleague_match.index += 1
+                    # self._df_files_MBTI_colleague_match.index = self._df_files_MBTI_colleague_match.index.map(str)
 
                 except Exception:
                     self._other_error(self._unknown_err, out=out)
@@ -4382,7 +4383,6 @@ def _colleague_personality_desorders(
             correlation_coefficients_disorders (pd.DataFrame): **DataFrame** c коэффициентами корреляции для расстройств
             target_scores (List[float]): Список оценок персональных качеств личности целевого человека
             personality_desorder_number (int): Количество приоритетных расстройств
-            col_name_ocean (str): Столбец с названиями персональных качеств личности человека
             threshold (float): Порог для оценок полярности качеств (например, интроверт < 0.55, экстраверт > 0.55)
             out (bool): Отображение
 
@@ -4441,6 +4441,13 @@ def _colleague_personality_desorders(
 
                         curr_weights = np.sum(curr_traits_matrix, axis=0)
 
+                        personality_type = "".join(
+                            [
+                                (name_mbti[idx_type][1] if curr_weights[idx_type] <= 0 else name_mbti[idx_type][0])
+                                for idx_type in range(len(curr_weights))
+                            ]
+                        )
+
                         for idx_type in range(len(curr_weights)):
                             idx_curr_matrix = pd_matrix[:, idx_type]
                             if curr_weights[idx_type] < 0:
@@ -4459,19 +4466,19 @@ def _colleague_personality_desorders(
                         pd_matrix = np.sum(pd_matrix, axis=1)
 
                         idx_max_values = np.argsort(-np.asarray(pd_matrix))[:personality_desorder_number]
-                        desorders = name_pd[idx_max_values]
+                        desorders = [name_pd[i] + ' ({})'.format(np.round(pd_matrix[i], 3)) for i in idx_max_values]
 
                         self._df_files_MBTI_disorders.loc[
                             str(path + 1),
-                            name_mbti.tolist()
-                            + [("Disorder" + " {}").format(i + 1) for i in range(personality_desorder_number)],
+                            ["MBTI"]
+                            + ["Disorder {}".format(i + 1) for i in range(personality_desorder_number)],
                         ] = (
-                            curr_weights.tolist() + desorders.tolist()
+                            [personality_type] + desorders
                         )
 
-                    self._df_files_MBTI_disorders.index.name = self._keys_id
-                    self._df_files_MBTI_disorders.index += 1
-                    self._df_files_MBTI_disorders.index = self._df_files_MBTI_disorders.index.map(str)
+                    # self._df_files_MBTI_disorders.index.name = self._keys_id
+                    # self._df_files_MBTI_disorders.index += 1
+                    # self._df_files_MBTI_disorders.index = self._df_files_MBTI_disorders.index.map(str)
 
                 except Exception:
                     self._other_error(self._unknown_err, out=out)

diff --git a/oceanai/modules/lab/text.py b/oceanai/modules/lab/text.py
@@ -781,9 +781,7 @@ def __process_audio_and_extract_features(
             self._model_transcriptions = WhisperForConditionalGeneration.from_pretrained(self._path_to_transriber).to(
                 self._device
             )
-
-        if lang == self.__lang_traslate[0]:
-            self.__forced_decoder_ids = self._processor.get_decoder_prompt_ids(language=lang, task="transcribe")
+            self._model_transcriptions.config.forced_decoder_ids = None
 
         path_to_wav = os.path.join(str(Path(path).parent), Path(path).stem + "." + "wav")
 
@@ -794,71 +792,40 @@ def __process_audio_and_extract_features(
                 )
                 call_audio = subprocess.call(ff_audio, shell=True)
 
-                try:
-                    if call_audio == 1:
-                        raise OSError
-                except OSError:
+                if call_audio != 0:
                     self._other_error(self._unknown_err, last=last, out=out)
                     return np.empty([]), np.empty([])
-                except Exception:
-                    self._other_error(self._unknown_err, last=last, out=out)
-                    return np.empty([]), np.empty([])
-                else:
-                    wav, sr = torchaudio.load(path_to_wav)
-
-                    if wav.size(0) > 1:
-                        wav = wav.mean(dim=0, keepdim=True)
-
-                    if sr != 16000:
-                        transform = torchaudio.transforms.Resample(orig_freq=sr, new_freq=16000)
-                        wav = transform(wav)
-                        sr = 16000
-
-                    wav = wav.squeeze(0)
-
-                    for start in range(0, len(wav), win):
-                        inputs = self._processor(wav[start : start + win], sampling_rate=16000, return_tensors="pt")
-                        input_features = inputs.input_features.to(self._device)
-                        if lang == self.__lang_traslate[0]:
-                            generated_ids = self._model_transcriptions.generate(
-                                input_features=input_features,
-                                forced_decoder_ids=self.__forced_decoder_ids,
-                                max_new_tokens=448,
-                            )
-                        elif lang == self.__lang_traslate[1]:
-                            generated_ids = self._model_transcriptions.generate(
-                                input_features=input_features, max_new_tokens=448
-                            )
-                        transcription = self._processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
-                        self.__text_pred += transcription
 
-                    return self.__translate_and_extract_features(self.__text_pred, lang, show_text, last, out)
-        else:
-            wav, sr = torchaudio.load(path_to_wav)
+        wav, sr = torchaudio.load(path_to_wav)
 
-            if wav.size(0) > 1:
-                wav = wav.mean(dim=0, keepdim=True)
+        if wav.size(0) > 1:
+            wav = wav.mean(dim=0, keepdim=True)
 
-            if sr != 16000:
-                transform = torchaudio.transforms.Resample(orig_freq=sr, new_freq=16000)
-                wav = transform(wav)
-                sr = 16000
+        if sr != 16000:
+            transform = torchaudio.transforms.Resample(orig_freq=sr, new_freq=16000)
+            wav = transform(wav)
+            sr = 16000
 
-            wav = wav.squeeze(0)
+        wav = wav.squeeze(0)
 
-            for start in range(0, len(wav), win):
-                inputs = self._processor(wav[start : start + win], sampling_rate=16000, return_tensors="pt")
-                input_features = inputs.input_features.to(self._device)
-                if lang == self.__lang_traslate[0]:
-                    generated_ids = self._model_transcriptions.generate(
-                        input_features=input_features, forced_decoder_ids=self.__forced_decoder_ids
-                    )
-                elif lang == self.__lang_traslate[1]:
-                    generated_ids = self._model_transcriptions.generate(input_features=input_features)
-                transcription = self._processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
-                self.__text_pred += transcription
+        for start in range(0, len(wav), win):
+            inputs = self._processor(wav[start : start + win], sampling_rate=16000, return_tensors="pt")
+            input_features = inputs.input_features.to(self._device)
+            if lang == self.__lang_traslate[0]:
+                generated_ids = self._model_transcriptions.generate(
+                    input_features=input_features,
+                )
+            elif lang == self.__lang_traslate[1]:
+                generated_ids = self._model_transcriptions.generate(
+                    input_features=input_features, language="en"
+                )
+            transcription = self._processor.batch_decode(generated_ids, skip_special_tokens=False)
+            transcription = re.findall(r'> ([^<>]+)', transcription[0])
+            self.__text_pred += transcription[0] + ' '
 
-            return self.__translate_and_extract_features(self.__text_pred, lang, show_text, last, out)
+        self.__text_pred = self.__text_pred.strip()
+
+        return self.__translate_and_extract_features(self.__text_pred, lang, show_text, last, out)
 
     def __load_text_model_b5(self, show_summary: bool = False, out: bool = True) -> Optional[nn.Module]:
         """Формирование нейросетевой архитектуры модели для получения оценок персональных качеств