|
942 | 942 | },
|
943 | 943 | "data": [
|
944 | 944 | {
|
945 |
| - "hovertemplate": "iteration=%{x}<br>difference rate=%{y}<extra></extra>", |
946 |
| - "legendgroup": "", |
| 945 | + "hovertemplate": "variable=between iterations i and i-1<br>iteration=%{x}<br>value=%{y}<extra></extra>", |
| 946 | + "legendgroup": "between iterations i and i-1", |
947 | 947 | "line": {
|
948 |
| - "color": "red", |
| 948 | + "color": "darkred", |
949 | 949 | "dash": "solid"
|
950 | 950 | },
|
951 | 951 | "marker": {
|
952 | 952 | "symbol": "circle"
|
953 | 953 | },
|
954 | 954 | "mode": "lines+markers",
|
955 |
| - "name": "", |
| 955 | + "name": "between iterations i and i-1", |
956 | 956 | "orientation": "v",
|
957 |
| - "showlegend": false, |
| 957 | + "showlegend": true, |
958 | 958 | "type": "scatter",
|
959 | 959 | "x": [
|
960 | 960 | "1",
|
|
1056 | 1056 | 0
|
1057 | 1057 | ],
|
1058 | 1058 | "yaxis": "y"
|
| 1059 | + }, |
| 1060 | + { |
| 1061 | + "hovertemplate": "variable=between iterations i and i-2<br>iteration=%{x}<br>value=%{y}<extra></extra>", |
| 1062 | + "legendgroup": "between iterations i and i-2", |
| 1063 | + "line": { |
| 1064 | + "color": "red", |
| 1065 | + "dash": "solid" |
| 1066 | + }, |
| 1067 | + "marker": { |
| 1068 | + "symbol": "circle" |
| 1069 | + }, |
| 1070 | + "mode": "lines+markers", |
| 1071 | + "name": "between iterations i and i-2", |
| 1072 | + "orientation": "v", |
| 1073 | + "showlegend": true, |
| 1074 | + "type": "scatter", |
| 1075 | + "x": [ |
| 1076 | + "1", |
| 1077 | + "2", |
| 1078 | + "3", |
| 1079 | + "4", |
| 1080 | + "5", |
| 1081 | + "6", |
| 1082 | + "7", |
| 1083 | + "8", |
| 1084 | + "9", |
| 1085 | + "10", |
| 1086 | + "11", |
| 1087 | + "12", |
| 1088 | + "13", |
| 1089 | + "14", |
| 1090 | + "15", |
| 1091 | + "16", |
| 1092 | + "17", |
| 1093 | + "18", |
| 1094 | + "19", |
| 1095 | + "20", |
| 1096 | + "21", |
| 1097 | + "22", |
| 1098 | + "23", |
| 1099 | + "24", |
| 1100 | + "25", |
| 1101 | + "26", |
| 1102 | + "27", |
| 1103 | + "28", |
| 1104 | + "29", |
| 1105 | + "30", |
| 1106 | + "31", |
| 1107 | + "32", |
| 1108 | + "33", |
| 1109 | + "34", |
| 1110 | + "35", |
| 1111 | + "36", |
| 1112 | + "37", |
| 1113 | + "38", |
| 1114 | + "39", |
| 1115 | + "40", |
| 1116 | + "41", |
| 1117 | + "42", |
| 1118 | + "43", |
| 1119 | + "44", |
| 1120 | + "45", |
| 1121 | + "46", |
| 1122 | + "47" |
| 1123 | + ], |
| 1124 | + "xaxis": "x", |
| 1125 | + "y": [ |
| 1126 | + null, |
| 1127 | + 0.3744703436493627, |
| 1128 | + 0.2584919679400187, |
| 1129 | + 0.2575624869773936, |
| 1130 | + 0.3354539416279705, |
| 1131 | + 0.4188757144837745, |
| 1132 | + 0.3815398854872195, |
| 1133 | + 0.37252935425033307, |
| 1134 | + 0.373612939871794, |
| 1135 | + 0.38583659389981584, |
| 1136 | + 0.32693083499488307, |
| 1137 | + 0.1754692310941527, |
| 1138 | + 0.1576660256716269, |
| 1139 | + 0.13324454007608433, |
| 1140 | + 0.09771999771283357, |
| 1141 | + 0.05219683729363822, |
| 1142 | + 0.015599375694597128, |
| 1143 | + 0.011972664216328588, |
| 1144 | + 0.016040515903046004, |
| 1145 | + 0.01899735951541015, |
| 1146 | + 0.01929476219023485, |
| 1147 | + 0.01679836591874828, |
| 1148 | + 0.01786593993372232, |
| 1149 | + 0.0048451869560888206, |
| 1150 | + 0.019296108665599387, |
| 1151 | + 0.015241922704512079, |
| 1152 | + 0.006375064795644425, |
| 1153 | + 0.014807977571616893, |
| 1154 | + 0, |
| 1155 | + 0.02972815215239899, |
| 1156 | + 0.031101702438839918, |
| 1157 | + 0.04622963124273283, |
| 1158 | + 0, |
| 1159 | + 0, |
| 1160 | + 0, |
| 1161 | + 0, |
| 1162 | + 0, |
| 1163 | + 0, |
| 1164 | + 0, |
| 1165 | + 0, |
| 1166 | + 0, |
| 1167 | + 0, |
| 1168 | + 0, |
| 1169 | + 0, |
| 1170 | + 0, |
| 1171 | + 0, |
| 1172 | + 0 |
| 1173 | + ], |
| 1174 | + "yaxis": "y" |
| 1175 | + }, |
| 1176 | + { |
| 1177 | + "hovertemplate": "variable=between iterations i and i-3<br>iteration=%{x}<br>value=%{y}<extra></extra>", |
| 1178 | + "legendgroup": "between iterations i and i-3", |
| 1179 | + "line": { |
| 1180 | + "color": "orange", |
| 1181 | + "dash": "solid" |
| 1182 | + }, |
| 1183 | + "marker": { |
| 1184 | + "symbol": "circle" |
| 1185 | + }, |
| 1186 | + "mode": "lines+markers", |
| 1187 | + "name": "between iterations i and i-3", |
| 1188 | + "orientation": "v", |
| 1189 | + "showlegend": true, |
| 1190 | + "type": "scatter", |
| 1191 | + "x": [ |
| 1192 | + "1", |
| 1193 | + "2", |
| 1194 | + "3", |
| 1195 | + "4", |
| 1196 | + "5", |
| 1197 | + "6", |
| 1198 | + "7", |
| 1199 | + "8", |
| 1200 | + "9", |
| 1201 | + "10", |
| 1202 | + "11", |
| 1203 | + "12", |
| 1204 | + "13", |
| 1205 | + "14", |
| 1206 | + "15", |
| 1207 | + "16", |
| 1208 | + "17", |
| 1209 | + "18", |
| 1210 | + "19", |
| 1211 | + "20", |
| 1212 | + "21", |
| 1213 | + "22", |
| 1214 | + "23", |
| 1215 | + "24", |
| 1216 | + "25", |
| 1217 | + "26", |
| 1218 | + "27", |
| 1219 | + "28", |
| 1220 | + "29", |
| 1221 | + "30", |
| 1222 | + "31", |
| 1223 | + "32", |
| 1224 | + "33", |
| 1225 | + "34", |
| 1226 | + "35", |
| 1227 | + "36", |
| 1228 | + "37", |
| 1229 | + "38", |
| 1230 | + "39", |
| 1231 | + "40", |
| 1232 | + "41", |
| 1233 | + "42", |
| 1234 | + "43", |
| 1235 | + "44", |
| 1236 | + "45", |
| 1237 | + "46", |
| 1238 | + "47" |
| 1239 | + ], |
| 1240 | + "xaxis": "x", |
| 1241 | + "y": [ |
| 1242 | + null, |
| 1243 | + null, |
| 1244 | + 0.4185488083500617, |
| 1245 | + 0.32059274695978623, |
| 1246 | + 0.38188407108973343, |
| 1247 | + 0.4328001083276046, |
| 1248 | + 0.43809523893493696, |
| 1249 | + 0.44863341493748243, |
| 1250 | + 0.42583584322701484, |
| 1251 | + 0.43306719119120696, |
| 1252 | + 0.43232359253714303, |
| 1253 | + 0.2672080333181822, |
| 1254 | + 0.24271630327842586, |
| 1255 | + 0.19890150358864345, |
| 1256 | + 0.13612013446745996, |
| 1257 | + 0.11116121611697616, |
| 1258 | + 0.05219683729363822, |
| 1259 | + 0.027471411522205114, |
| 1260 | + 0.016040515903046004, |
| 1261 | + 0.026412277852278176, |
| 1262 | + 0.01529331105371401, |
| 1263 | + 0.006371557221197532, |
| 1264 | + 0.02157985109054683, |
| 1265 | + 0.01786593993372232, |
| 1266 | + 0.02411961744620883, |
| 1267 | + 0.015241922704512079, |
| 1268 | + 0.013007750148461006, |
| 1269 | + 0.01881557616407381, |
| 1270 | + 0.00226498380677842, |
| 1271 | + 0.017527343287245367, |
| 1272 | + 0.042934433390037885, |
| 1273 | + 0.031101702438839918, |
| 1274 | + 0.04622963124273283, |
| 1275 | + 0, |
| 1276 | + 0, |
| 1277 | + 0, |
| 1278 | + 0, |
| 1279 | + 0, |
| 1280 | + 0, |
| 1281 | + 0, |
| 1282 | + 0, |
| 1283 | + 0, |
| 1284 | + 0, |
| 1285 | + 0, |
| 1286 | + 0, |
| 1287 | + 0, |
| 1288 | + 0 |
| 1289 | + ], |
| 1290 | + "yaxis": "y" |
1059 | 1291 | }
|
1060 | 1292 | ],
|
1061 | 1293 | "layout": {
|
1062 | 1294 | "legend": {
|
| 1295 | + "title": { |
| 1296 | + "text": "variable" |
| 1297 | + }, |
1063 | 1298 | "tracegroupgap": 0
|
1064 | 1299 | },
|
1065 | 1300 | "template": {
|
|
1902 | 2137 | 1
|
1903 | 2138 | ],
|
1904 | 2139 | "title": {
|
1905 |
| - "text": "difference rate" |
| 2140 | + "text": "difference (1.0-vmeasure)" |
1906 | 2141 | }
|
1907 | 2142 | }
|
1908 | 2143 | }
|
|
1914 | 2149 | ],
|
1915 | 2150 | "source": [
|
1916 | 2151 | "# Compute clustering difference score for each iteration.\n",
|
1917 |
| - "clustering_differences: Dict[str, float] = {}\n", |
| 2152 | + "clustering_differences: Dict[str, Dict[int, float]] = {}\n", |
1918 | 2153 | "for iteration in range(1, int(df_status[\"iteration_id\"])):\n",
|
1919 |
| - " # Get iteration ids.\n", |
| 2154 | + " # Get current iteration id.\n", |
1920 | 2155 | " current_iteration_id: str = str(iteration)\n",
|
1921 |
| - " previous_iteration_id: str = str(iteration-1)\n", |
1922 |
| - " # Format clustering results: Get common text ids.\n", |
1923 |
| - " list_of_common_text_ids: List[str] = [\n", |
1924 |
| - " text_id\n", |
1925 |
| - " for text_id in df_clusterings.index\n", |
1926 |
| - " if (\n", |
1927 |
| - " df_texts[\"is_deleted\"][text_id] == False\n", |
1928 |
| - " ) and (\n", |
1929 |
| - " df_clusterings[previous_iteration_id][text_id] != \"-1\"\n", |
1930 |
| - " ) and (\n", |
1931 |
| - " df_clusterings[current_iteration_id][text_id] != \"-1\"\n", |
| 2156 | + " clustering_differences[current_iteration_id] = {}\n", |
| 2157 | + " # Analyze differences with previous iterations at several levels of depth. \n", |
| 2158 | + " for depth in [1, 2, 3]:\n", |
| 2159 | + " # Get previous iteration id.\n", |
| 2160 | + " if iteration<depth:\n", |
| 2161 | + " continue\n", |
| 2162 | + " previous_iteration_id: str = str(iteration-depth)\n", |
| 2163 | + " # Format clustering results: Get common text ids.\n", |
| 2164 | + " list_of_common_text_ids: List[str] = [\n", |
| 2165 | + " text_id\n", |
| 2166 | + " for text_id in df_clusterings.index\n", |
| 2167 | + " if (\n", |
| 2168 | + " df_texts[\"is_deleted\"][text_id] == False\n", |
| 2169 | + " ) and (\n", |
| 2170 | + " df_clusterings[previous_iteration_id][text_id] != \"-1\"\n", |
| 2171 | + " ) and (\n", |
| 2172 | + " df_clusterings[current_iteration_id][text_id] != \"-1\"\n", |
| 2173 | + " )\n", |
| 2174 | + " ]\n", |
| 2175 | + " # Compute scores.\n", |
| 2176 | + " clustering_differences[current_iteration_id][f\"between iterations i and i-{depth}\"] = 1.0 - metrics.v_measure_score(\n", |
| 2177 | + " labels_true=df_clusterings[previous_iteration_id][list_of_common_text_ids],\n", |
| 2178 | + " labels_pred=df_clusterings[current_iteration_id][list_of_common_text_ids],\n", |
1932 | 2179 | " )\n",
|
1933 |
| - " ]\n", |
1934 |
| - " # Compute scores.\n", |
1935 |
| - " clustering_differences[current_iteration_id] = 1.0 - metrics.v_measure_score(\n", |
1936 |
| - " labels_true=df_clusterings[previous_iteration_id][list_of_common_text_ids],\n", |
1937 |
| - " labels_pred=df_clusterings[current_iteration_id][list_of_common_text_ids],\n", |
1938 |
| - " )\n", |
1939 |
| - "df_clustering_differences: pd.DataFrame = pd.DataFrame.from_dict(clustering_differences, orient=\"index\", columns=[\"difference rate\"])\n", |
| 2180 | + "df_clustering_differences: pd.DataFrame = pd.DataFrame.from_dict(clustering_differences, orient=\"index\")\n", |
1940 | 2181 | "df_clustering_differences[\"iteration\"] = df_clustering_differences.index\n",
|
1941 | 2182 | "\n",
|
1942 | 2183 | "# Display clustering difference scores.\n",
|
1943 | 2184 | "fig = px.line(\n",
|
1944 | 2185 | " df_clustering_differences,\n",
|
1945 | 2186 | " x=\"iteration\",\n",
|
1946 |
| - " y=\"difference rate\",\n", |
| 2187 | + " y=[\"between iterations i and i-1\", \"between iterations i and i-2\", \"between iterations i and i-3\"],\n", |
1947 | 2188 | " markers=True,\n",
|
1948 | 2189 | " title=\"<b>Evolution of clustering results differences</b>\",\n",
|
1949 |
| - " color_discrete_sequence=[\"red\"],\n", |
| 2190 | + " color_discrete_sequence=[\"darkred\", \"red\", \"orange\"],\n", |
| 2191 | + ")\n", |
| 2192 | + "fig.update_layout(\n", |
| 2193 | + " yaxis_title=\"difference (1.0-vmeasure)\",\n", |
| 2194 | + " yaxis=dict(range=[0.0, 1.0])\n", |
1950 | 2195 | ")\n",
|
1951 |
| - "fig.update_layout(yaxis=dict(range=[0.0, 1.0]))\n", |
1952 | 2196 | "fig"
|
1953 | 2197 | ]
|
1954 | 2198 | },
|
|
2532 | 2776 | },
|
2533 | 2777 | {
|
2534 | 2778 | "cell_type": "code",
|
2535 |
| - "execution_count": 24, |
| 2779 | + "execution_count": 22, |
2536 | 2780 | "id": "847a05d9",
|
2537 | 2781 | "metadata": {},
|
2538 | 2782 | "outputs": [
|
|
0 commit comments