-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
0 parents
commit ee49174
Showing
8 changed files
with
308,660 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,5 @@ | ||
nombre_de_phrases=300000 | ||
nombre_de_mots=7535392 | ||
taille_du_vocabulaire=308153 | ||
nombre_de_pos=52 | ||
longueur_mn_phrases=25.11797333333333 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,68 @@ | ||
,0 | ||
68,418 | ||
21,9217 | ||
4,5173 | ||
23,8612 | ||
22,8912 | ||
8,7183 | ||
33,5387 | ||
20,9473 | ||
17,9946 | ||
50,1594 | ||
14,9740 | ||
28,6822 | ||
13,9278 | ||
35,4511 | ||
25,7931 | ||
47,1918 | ||
16,9850 | ||
48,1769 | ||
11,8631 | ||
12,8907 | ||
46,2034 | ||
64,566 | ||
15,9893 | ||
57,919 | ||
27,7278 | ||
9,7459 | ||
26,7716 | ||
34,4936 | ||
40,3197 | ||
18,9856 | ||
7,5854 | ||
29,6572 | ||
10,7827 | ||
39,3387 | ||
44,2467 | ||
19,9888 | ||
37,3957 | ||
24,8451 | ||
45,2346 | ||
6,5165 | ||
32,5541 | ||
38,3753 | ||
30,6366 | ||
5,5425 | ||
61,669 | ||
49,1637 | ||
36,4299 | ||
65,522 | ||
52,1266 | ||
58,835 | ||
31,6046 | ||
70,378 | ||
51,1419 | ||
54,1145 | ||
42,2787 | ||
41,2932 | ||
53,1245 | ||
67,431 | ||
62,638 | ||
43,2555 | ||
55,1067 | ||
59,814 | ||
66,483 | ||
63,601 | ||
60,749 | ||
56,944 | ||
69,413 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,3 @@ | ||
moyenne des longueurs des phrases : 24.11797333333333 | ||
médiane des longueurs des phrases : 22.0 | ||
écart-type des longueurs des phrases : 13.526183453512454 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,260 @@ | ||
# -*- coding: utf-8 -*- | ||
|
||
### @authors: Anastasiia | ||
|
||
# =================================== | ||
# Bases Programmation - Mini Projet | ||
# =================================== | ||
|
||
# ========================= | ||
# 3 - Partie introductive | ||
# ========================= | ||
|
||
import csv | ||
import pandas as pd | ||
|
||
# Read the .conll file into a DataFrame, one row per line of the corpus.
# - `with` guarantees the handle is closed once the rows are loaded.
# - QUOTE_NONE stops csv from treating a literal '"' token as the start of a
#   quoted field, which would merge fields/lines and shift the columns.
# - NOTE(review): the corpus is assumed to be UTF-8 encoded -- confirm.
with open("wikip_small.conll", encoding="utf-8", newline="") as corpus_file:
    wikip_small = pd.DataFrame(
        csv.reader(corpus_file, delimiter="\t", quoting=csv.QUOTE_NONE)
    )

### 1. Number of sentences in the corpus

# Column 0 holds the token ids; every sentence starts with the id "1".
wikip_small_id = list(wikip_small.iloc[:, 0])
nb_phrases = sum(1 for token_id in wikip_small_id if token_id == "1")
print(nb_phrases)

### 2. Number of words in the corpus

# Blank separator lines yield rows whose word cell is missing; they are
# dropped so that they are not counted as words.
wikip_small_mot = list(wikip_small.iloc[:, 1].dropna())
nb_mots = len(wikip_small_mot)
print(nb_mots)

### 3. Vocabulary size

# Distinct word forms (missing cells already removed above).
wikip_small_voc = set(wikip_small_mot)
taille_voc = len(wikip_small_voc)
print(taille_voc)

### 4. Number of PoS tags

# Distinct values of column 3, again ignoring the separator rows.
wikip_small_pos = set(wikip_small.iloc[:, 3].dropna())
nb_pos = len(wikip_small_pos)
print(nb_pos)

### 5. Average sentence length

# Words per sentence; guarded so an empty corpus does not divide by zero.
longueur_phrases = nb_mots / nb_phrases if nb_phrases else 0.0
print(longueur_phrases)

### 6. Write the results to a file

# Same key=value layout as before, with no newline after the last entry.
with open("corpus_count.txt", "w") as corpus_count:
    corpus_count.write(
        f"nombre_de_phrases={nb_phrases}\n"
        f"nombre_de_mots={nb_mots}\n"
        f"taille_du_vocabulaire={taille_voc}\n"
        f"nombre_de_pos={nb_pos}\n"
        f"longueur_mn_phrases={longueur_phrases}"
    )
|
||
# ============================== | ||
# 4 - Partie avancée | ||
# ============================== | ||
# 4.1. Fréquence du vocabulaire | ||
# ============================== | ||
|
||
### 1. Fonction word_freq | ||
|
||
import csv | ||
import pandas as pd | ||
|
||
# Reload the corpus as a DataFrame, one row per line of the .conll file.
# - `with` closes the handle as soon as the rows have been read.
# - QUOTE_NONE stops csv from treating a literal '"' token as the start of a
#   quoted field, which would merge fields/lines and shift the columns.
# - NOTE(review): the corpus is assumed to be UTF-8 encoded -- confirm.
with open("wikip_small.conll", encoding="utf-8", newline="") as corpus_file:
    wikip_small = pd.DataFrame(
        csv.reader(corpus_file, delimiter="\t", quoting=csv.QUOTE_NONE)
    )
|
||
### 1. Ecrire la fonction word_freq | ||
|
||
def word_freq(file):
    """
    Count how often each word form occurs in the corpus.

    Rows whose word cell is missing (the rows produced by blank separator
    lines) are ignored, so the missing value is never counted as a word.

    :param file: corpus in CoNLL layout, one token per row, word form in
        column 1
    :type file: pandas.DataFrame
    :return: word form -> number of occurrences, in order of first appearance
    :rtype: dict
    """
    from collections import Counter

    # dropna() removes the None/NaN cells left by sentence separators.
    mots = file.iloc[:, 1].dropna()

    return dict(Counter(mots))
|
||
### 2. Store the word frequencies in a DataFrame

dict_mot_freq = word_freq(wikip_small)

# One row per word with its count in column 0 (built directly instead of
# creating a one-row frame and transposing it).
df_mot_freq = pd.DataFrame.from_dict(dict_mot_freq, orient="index")

### 3. Save the DataFrame to a .csv file

df_mot_freq.to_csv("word_freq.csv")

### 4. Sort the words by decreasing frequency and save the result to a txt file

# Sort once and reuse the result for both the file and the top-five print.
df_mot_freq_sorted = df_mot_freq.sort_values(by=[0], ascending=False)

# to_string() renders every row; str(DataFrame) abbreviates long frames with
# "..." and would silently drop most of the vocabulary from the file.
with open("sorted_word_freq.txt", "w", encoding="utf-8") as sorted_word_freq:
    sorted_word_freq.write(df_mot_freq_sorted.to_string())

### 5. The five most frequent words

print(df_mot_freq_sorted.head(5))

# Values recorded from an earlier run:
# 1: "," = 55224
# 2: "de" = 54091
# 3: "NaN" = 45330   (presumably the rows with a missing word cell -- verify)
# 4: "." = 44775
# 5: "la" = 26116
|
||
# ======================= | ||
# 4.2 Fréquence des PoS | ||
# ======================= | ||
|
||
### 1. Écrire la fonction pos_freq | ||
|
||
def pos_freq(file):
    """
    Count how often each part-of-speech tag occurs in the corpus.

    Rows whose PoS cell is missing (the rows produced by blank separator
    lines) are ignored, so the missing value is never counted as a tag.

    :param file: corpus in CoNLL layout, one token per row, PoS tag in
        column 3
    :type file: pandas.DataFrame
    :return: PoS tag -> number of occurrences, in order of first appearance
    :rtype: dict
    """
    from collections import Counter

    # dropna() removes the None/NaN cells left by sentence separators.
    etiquettes = file.iloc[:, 3].dropna()

    return dict(Counter(etiquettes))
|
||
dict_pos_freq = pos_freq(wikip_small)

### 2. Store the PoS frequencies in a DataFrame

# One row per tag with its count in column 0 (built directly instead of
# creating a one-row frame and transposing it).
df_pos_freq = pd.DataFrame.from_dict(dict_pos_freq, orient="index")

### 3. Save the DataFrame to a .csv file

df_pos_freq.to_csv("pos_freq.csv")

### 4. Sort the PoS tags by decreasing frequency and save the result to a .txt file

# Sort once and reuse the result for both the file and the final print.
df_pos_freq_sorted = df_pos_freq.sort_values(by=[0], ascending=False)

# to_string() always renders every row, whereas str(DataFrame) abbreviates
# the output once the frame exceeds pandas' display limit.
with open("sorted_pos_freq.txt", "w", encoding="utf-8") as sorted_pos_freq:
    sorted_pos_freq.write(df_pos_freq_sorted.to_string())

### 5. Most frequent PoS tag

print(df_pos_freq_sorted.head(1))
|
||
|
||
# ============================ | ||
# 4.3 - Longueurs des phrases | ||
# ============================ | ||
|
||
### 1. Écrire une fonction length_freq | ||
|
||
def length_freq(file):
    """
    Count how often each sentence length occurs in the corpus.

    The length of a sentence is taken to be the id of its last token, i.e.
    the last id seen before a row whose id cell is missing (blank separator
    line) or before the end of the data.

    :param file: corpus in CoNLL layout, one token per row, token id in
        column 0
    :type file: pandas.DataFrame
    :return: sentence length (the id value exactly as stored in column 0)
        -> number of sentences of that length
    :rtype: dict
    """
    colonne_id = file.iloc[:, 0]

    freq_taille_phrase = {}
    dernier_id = None  # id of the last token of the sentence being read

    for id_mot, est_vide in zip(colonne_id, colonne_id.isna()):
        if not est_vide:
            dernier_id = id_mot
            continue
        # Separator row: close the current sentence, if there is one.
        # Leading or repeated blank rows have nothing to close, so they no
        # longer record a bogus length (or wrap around to the last row).
        if dernier_id is not None:
            freq_taille_phrase[dernier_id] = freq_taille_phrase.get(dernier_id, 0) + 1
            dernier_id = None

    # The last sentence still counts when no blank line follows it.
    if dernier_id is not None:
        freq_taille_phrase[dernier_id] = freq_taille_phrase.get(dernier_id, 0) + 1

    return freq_taille_phrase
|
||
### 2. Store the sentence-length frequencies in a DataFrame

dict_taille_phrase = length_freq(wikip_small)

# One row per length with its count in column 0 (built directly instead of
# creating a one-row frame and transposing it).
df_taille_phrase = pd.DataFrame.from_dict(dict_taille_phrase, orient="index")

### 3. Save the DataFrame to a .csv file

df_taille_phrase.to_csv("length_freq.csv")

### 4. Compute the mean, median and standard deviation of the sentence lengths

# Expand the frequency table back into one entry per sentence instead of
# scanning the id column a second time with a copy of length_freq's logic.
taille_phrase = [
    longueur
    for longueur, nb_occurrences in dict_taille_phrase.items()
    for _ in range(nb_occurrences)
]

# The lengths come from the id column and must be converted to integers
# before any arithmetic.
taille_phrase_int = [int(longueur) for longueur in taille_phrase]

df_taille_phrase_int = pd.DataFrame(taille_phrase_int)

moyenne = df_taille_phrase_int.mean()[0]
mediane = df_taille_phrase_int.median()[0]
ecart_type = df_taille_phrase_int.std()[0]

### 5. Save these results to a txt file

# The labels contain accented characters, so the encoding is set explicitly
# rather than left to the platform default.
with open("length_info.txt", "w", encoding="utf-8") as x:
    x.write("moyenne des longueurs des phrases : ")
    x.write(str(moyenne) + "\n")
    x.write("médiane des longueurs des phrases : ")
    x.write(str(mediane) + "\n")
    x.write("écart-type des longueurs des phrases : ")
    x.write(str(ecart_type) + "\n")
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,53 @@ | ||
,0 | ||
CL,167719 | ||
V,707407 | ||
P,950555 | ||
D,901420 | ||
N,2158027 | ||
A,489132 | ||
PRO,92197 | ||
PONCT,1007998 | ||
ET,154653 | ||
ADV,190066 | ||
C,211978 | ||
,300000 | ||
P+D,202743 | ||
I,221 | ||
P+PRO,696 | ||
PREF,518 | ||
DET,4 | ||
s=w,2 | ||
XXe,6 | ||
岡本,1 | ||
バイオミラクル,1 | ||
XVe,3 | ||
VIIIe,1 | ||
9,1 | ||
NC,2 | ||
12,1 | ||
4,1 | ||
XVIIe,2 | ||
XVIIIe,4 | ||
北方領土,1 | ||
IIIe,1 | ||
Ve,1 | ||
CS,1 | ||
_,4 | ||
s=c,5 | ||
1,2 | ||
500,1 | ||
800,1 | ||
IXe,1 | ||
VIIe,1 | ||
XIe,2 | ||
s=s,1 | ||
XIIIe,2 | ||
IVe,1 | ||
Xe,1 | ||
Gus,1 | ||
XIIe,1 | ||
XIXe,1 | ||
mwehead=PRO,1 | ||
御門,1 | ||
日本の写真,1 | ||
789,1 |
Oops, something went wrong.