Skip to content

Commit 4e5fcb8

Browse files
committed
docs: update figures
1 parent e2e3f8e commit 4e5fcb8

File tree

3 files changed

+274
-30
lines changed

3 files changed

+274
-30
lines changed
27.8 KB
Loading

docs/notebooks/analysis.ipynb

+274 -30
Original file line number | Diff line number | Diff line change
@@ -942,19 +942,19 @@
942942
},
943943
"data": [
944944
{
945-
"hovertemplate": "iteration=%{x}<br>difference rate=%{y}<extra></extra>",
946-
"legendgroup": "",
945+
"hovertemplate": "variable=between iterations i and i-1<br>iteration=%{x}<br>value=%{y}<extra></extra>",
946+
"legendgroup": "between iterations i and i-1",
947947
"line": {
948-
"color": "red",
948+
"color": "darkred",
949949
"dash": "solid"
950950
},
951951
"marker": {
952952
"symbol": "circle"
953953
},
954954
"mode": "lines+markers",
955-
"name": "",
955+
"name": "between iterations i and i-1",
956956
"orientation": "v",
957-
"showlegend": false,
957+
"showlegend": true,
958958
"type": "scatter",
959959
"x": [
960960
"1",
@@ -1056,10 +1056,245 @@
10561056
0
10571057
],
10581058
"yaxis": "y"
1059+
},
1060+
{
1061+
"hovertemplate": "variable=between iterations i and i-2<br>iteration=%{x}<br>value=%{y}<extra></extra>",
1062+
"legendgroup": "between iterations i and i-2",
1063+
"line": {
1064+
"color": "red",
1065+
"dash": "solid"
1066+
},
1067+
"marker": {
1068+
"symbol": "circle"
1069+
},
1070+
"mode": "lines+markers",
1071+
"name": "between iterations i and i-2",
1072+
"orientation": "v",
1073+
"showlegend": true,
1074+
"type": "scatter",
1075+
"x": [
1076+
"1",
1077+
"2",
1078+
"3",
1079+
"4",
1080+
"5",
1081+
"6",
1082+
"7",
1083+
"8",
1084+
"9",
1085+
"10",
1086+
"11",
1087+
"12",
1088+
"13",
1089+
"14",
1090+
"15",
1091+
"16",
1092+
"17",
1093+
"18",
1094+
"19",
1095+
"20",
1096+
"21",
1097+
"22",
1098+
"23",
1099+
"24",
1100+
"25",
1101+
"26",
1102+
"27",
1103+
"28",
1104+
"29",
1105+
"30",
1106+
"31",
1107+
"32",
1108+
"33",
1109+
"34",
1110+
"35",
1111+
"36",
1112+
"37",
1113+
"38",
1114+
"39",
1115+
"40",
1116+
"41",
1117+
"42",
1118+
"43",
1119+
"44",
1120+
"45",
1121+
"46",
1122+
"47"
1123+
],
1124+
"xaxis": "x",
1125+
"y": [
1126+
null,
1127+
0.3744703436493627,
1128+
0.2584919679400187,
1129+
0.2575624869773936,
1130+
0.3354539416279705,
1131+
0.4188757144837745,
1132+
0.3815398854872195,
1133+
0.37252935425033307,
1134+
0.373612939871794,
1135+
0.38583659389981584,
1136+
0.32693083499488307,
1137+
0.1754692310941527,
1138+
0.1576660256716269,
1139+
0.13324454007608433,
1140+
0.09771999771283357,
1141+
0.05219683729363822,
1142+
0.015599375694597128,
1143+
0.011972664216328588,
1144+
0.016040515903046004,
1145+
0.01899735951541015,
1146+
0.01929476219023485,
1147+
0.01679836591874828,
1148+
0.01786593993372232,
1149+
0.0048451869560888206,
1150+
0.019296108665599387,
1151+
0.015241922704512079,
1152+
0.006375064795644425,
1153+
0.014807977571616893,
1154+
0,
1155+
0.02972815215239899,
1156+
0.031101702438839918,
1157+
0.04622963124273283,
1158+
0,
1159+
0,
1160+
0,
1161+
0,
1162+
0,
1163+
0,
1164+
0,
1165+
0,
1166+
0,
1167+
0,
1168+
0,
1169+
0,
1170+
0,
1171+
0,
1172+
0
1173+
],
1174+
"yaxis": "y"
1175+
},
1176+
{
1177+
"hovertemplate": "variable=between iterations i and i-3<br>iteration=%{x}<br>value=%{y}<extra></extra>",
1178+
"legendgroup": "between iterations i and i-3",
1179+
"line": {
1180+
"color": "orange",
1181+
"dash": "solid"
1182+
},
1183+
"marker": {
1184+
"symbol": "circle"
1185+
},
1186+
"mode": "lines+markers",
1187+
"name": "between iterations i and i-3",
1188+
"orientation": "v",
1189+
"showlegend": true,
1190+
"type": "scatter",
1191+
"x": [
1192+
"1",
1193+
"2",
1194+
"3",
1195+
"4",
1196+
"5",
1197+
"6",
1198+
"7",
1199+
"8",
1200+
"9",
1201+
"10",
1202+
"11",
1203+
"12",
1204+
"13",
1205+
"14",
1206+
"15",
1207+
"16",
1208+
"17",
1209+
"18",
1210+
"19",
1211+
"20",
1212+
"21",
1213+
"22",
1214+
"23",
1215+
"24",
1216+
"25",
1217+
"26",
1218+
"27",
1219+
"28",
1220+
"29",
1221+
"30",
1222+
"31",
1223+
"32",
1224+
"33",
1225+
"34",
1226+
"35",
1227+
"36",
1228+
"37",
1229+
"38",
1230+
"39",
1231+
"40",
1232+
"41",
1233+
"42",
1234+
"43",
1235+
"44",
1236+
"45",
1237+
"46",
1238+
"47"
1239+
],
1240+
"xaxis": "x",
1241+
"y": [
1242+
null,
1243+
null,
1244+
0.4185488083500617,
1245+
0.32059274695978623,
1246+
0.38188407108973343,
1247+
0.4328001083276046,
1248+
0.43809523893493696,
1249+
0.44863341493748243,
1250+
0.42583584322701484,
1251+
0.43306719119120696,
1252+
0.43232359253714303,
1253+
0.2672080333181822,
1254+
0.24271630327842586,
1255+
0.19890150358864345,
1256+
0.13612013446745996,
1257+
0.11116121611697616,
1258+
0.05219683729363822,
1259+
0.027471411522205114,
1260+
0.016040515903046004,
1261+
0.026412277852278176,
1262+
0.01529331105371401,
1263+
0.006371557221197532,
1264+
0.02157985109054683,
1265+
0.01786593993372232,
1266+
0.02411961744620883,
1267+
0.015241922704512079,
1268+
0.013007750148461006,
1269+
0.01881557616407381,
1270+
0.00226498380677842,
1271+
0.017527343287245367,
1272+
0.042934433390037885,
1273+
0.031101702438839918,
1274+
0.04622963124273283,
1275+
0,
1276+
0,
1277+
0,
1278+
0,
1279+
0,
1280+
0,
1281+
0,
1282+
0,
1283+
0,
1284+
0,
1285+
0,
1286+
0,
1287+
0,
1288+
0
1289+
],
1290+
"yaxis": "y"
10591291
}
10601292
],
10611293
"layout": {
10621294
"legend": {
1295+
"title": {
1296+
"text": "variable"
1297+
},
10631298
"tracegroupgap": 0
10641299
},
10651300
"template": {
@@ -1902,7 +2137,7 @@
19022137
1
19032138
],
19042139
"title": {
1905-
"text": "difference rate"
2140+
"text": "difference (1.0-vmeasure)"
19062141
}
19072142
}
19082143
}
@@ -1914,41 +2149,50 @@
19142149
],
19152150
"source": [
19162151
"# Compute clustering difference score for each iteration.\n",
1917-
"clustering_differences: Dict[str, float] = {}\n",
2152+
"clustering_differences: Dict[str, Dict[int, float]] = {}\n",
19182153
"for iteration in range(1, int(df_status[\"iteration_id\"])):\n",
1919-
" # Get iteration ids.\n",
2154+
" # Get current iteration id.\n",
19202155
" current_iteration_id: str = str(iteration)\n",
1921-
" previous_iteration_id: str = str(iteration-1)\n",
1922-
" # Format clustering results: Get common text ids.\n",
1923-
" list_of_common_text_ids: List[str] = [\n",
1924-
" text_id\n",
1925-
" for text_id in df_clusterings.index\n",
1926-
" if (\n",
1927-
" df_texts[\"is_deleted\"][text_id] == False\n",
1928-
" ) and (\n",
1929-
" df_clusterings[previous_iteration_id][text_id] != \"-1\"\n",
1930-
" ) and (\n",
1931-
" df_clusterings[current_iteration_id][text_id] != \"-1\"\n",
2156+
" clustering_differences[current_iteration_id] = {}\n",
2157+
" # Analyze differences with previous iterations at several levels of depth. \n",
2158+
" for depth in [1, 2, 3]:\n",
2159+
" # Get previous iteration id.\n",
2160+
" if iteration<depth:\n",
2161+
" continue\n",
2162+
" previous_iteration_id: str = str(iteration-depth)\n",
2163+
" # Format clustering results: Get common text ids.\n",
2164+
" list_of_common_text_ids: List[str] = [\n",
2165+
" text_id\n",
2166+
" for text_id in df_clusterings.index\n",
2167+
" if (\n",
2168+
" df_texts[\"is_deleted\"][text_id] == False\n",
2169+
" ) and (\n",
2170+
" df_clusterings[previous_iteration_id][text_id] != \"-1\"\n",
2171+
" ) and (\n",
2172+
" df_clusterings[current_iteration_id][text_id] != \"-1\"\n",
2173+
" )\n",
2174+
" ]\n",
2175+
" # Compute scores.\n",
2176+
" clustering_differences[current_iteration_id][f\"between iterations i and i-{depth}\"] = 1.0 - metrics.v_measure_score(\n",
2177+
" labels_true=df_clusterings[previous_iteration_id][list_of_common_text_ids],\n",
2178+
" labels_pred=df_clusterings[current_iteration_id][list_of_common_text_ids],\n",
19322179
" )\n",
1933-
" ]\n",
1934-
" # Compute scores.\n",
1935-
" clustering_differences[current_iteration_id] = 1.0 - metrics.v_measure_score(\n",
1936-
" labels_true=df_clusterings[previous_iteration_id][list_of_common_text_ids],\n",
1937-
" labels_pred=df_clusterings[current_iteration_id][list_of_common_text_ids],\n",
1938-
" )\n",
1939-
"df_clustering_differences: pd.DataFrame = pd.DataFrame.from_dict(clustering_differences, orient=\"index\", columns=[\"difference rate\"])\n",
2180+
"df_clustering_differences: pd.DataFrame = pd.DataFrame.from_dict(clustering_differences, orient=\"index\")\n",
19402181
"df_clustering_differences[\"iteration\"] = df_clustering_differences.index\n",
19412182
"\n",
19422183
"# Display clustering difference scores.\n",
19432184
"fig = px.line(\n",
19442185
" df_clustering_differences,\n",
19452186
" x=\"iteration\",\n",
1946-
" y=\"difference rate\",\n",
2187+
" y=[\"between iterations i and i-1\", \"between iterations i and i-2\", \"between iterations i and i-3\"],\n",
19472188
" markers=True,\n",
19482189
" title=\"<b>Evolution of clustering results differences</b>\",\n",
1949-
" color_discrete_sequence=[\"red\"],\n",
2190+
" color_discrete_sequence=[\"darkred\", \"red\", \"orange\"],\n",
2191+
")\n",
2192+
"fig.update_layout(\n",
2193+
" yaxis_title=\"difference (1.0-vmeasure)\",\n",
2194+
" yaxis=dict(range=[0.0, 1.0])\n",
19502195
")\n",
1951-
"fig.update_layout(yaxis=dict(range=[0.0, 1.0]))\n",
19522196
"fig"
19532197
]
19542198
},
@@ -2532,7 +2776,7 @@
25322776
},
25332777
{
25342778
"cell_type": "code",
2535-
"execution_count": 24,
2779+
"execution_count": 22,
25362780
"id": "847a05d9",
25372781
"metadata": {},
25382782
"outputs": [
Loading

0 commit comments

Comments
 (0)