Dataset Code #16

Open · wants to merge 6 commits into main
14 changes: 14 additions & 0 deletions .gitignore
@@ -26,6 +26,13 @@ share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
*.out
*.png
*.pdf
*stats*.p
*.zip


MANIFEST

# PyInstaller
@@ -128,3 +135,10 @@ dmypy.json

# Pyre type checker
.pyre/


eda/csv_explorer.ipynb
*.html
*.txt
*.mp3
*.p
67 changes: 67 additions & 0 deletions eda/frequent_wordcounts.py
@@ -0,0 +1,67 @@
import os
import pickle
from collections import Counter
from pathlib import Path

from tqdm import tqdm

# Root of the extracted frequent-word clips: one subdirectory per language,
# each holding clips/<word>/*.wav.
# fw_path1 = Path('/mnt/disks/std2/data/generated/common_voice/frequent_words')  # superseded
fw_path1 = Path('/mnt/disks/std3/compressed_cleaned3/generated/common_voice/frequent_words')
# fw_path2 = Path('/mnt/disks/std3/data/generated/common_voice/frequent_words')

langs = os.listdir(fw_path1)
# langs += os.listdir(fw_path2)

# Map each language to a Counter of {word: number of extracted clips}.
lang_to_c = {}

for l in tqdm(langs):
    c = Counter()
    words1_path = fw_path1 / l / 'clips'
    words1 = os.listdir(words1_path) if words1_path.is_dir() else []
    for w in tqdm(words1):
        wavs = os.listdir(words1_path / w)
        c[w] = len(wavs)

    lang_to_c[l] = c

# Persist the per-language clip counts for downstream analysis and plotting.
with open('frequent_words_stats_std_final.p', 'wb') as f:
    pickle.dump(lang_to_c, f)

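A minimal sketch of how the pickle written above might be consumed downstream, assuming the dict-of-Counters structure this script produces:

import pickle
from collections import Counter

# Load the per-language word counts written by frequent_wordcounts.py.
with open('frequent_words_stats_std_final.p', 'rb') as f:
    lang_to_c = pickle.load(f)

# Show the five most frequently extracted words for each language.
for lang, counter in lang_to_c.items():
    print(lang, Counter(counter).most_common(5))
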
78 changes: 78 additions & 0 deletions eda/get_data.py
@@ -0,0 +1,78 @@
#%%
import numpy as np
import pandas as pd
from argparse import ArgumentParser

from typing import Dict

def argparser():
    """Parse the input/output CSV paths from the command line."""
    parser = ArgumentParser()
    parser.add_argument("--input_file", type=str, help="input CSV with language and counts columns")
    parser.add_argument("--output_file", type=str, help="output CSV for the summary table")
    return parser.parse_args()

def get_number_of_keywords(df: pd.DataFrame, min_num_of_extractions: int = 20) -> Dict[str, int]:
    """Count, per language, the keywords with at least `min_num_of_extractions` clips."""
    langs = np.unique(df['language'].values)
    vals = {}
    for l in langs:
        tdf = df[df['language'] == l]
        tdf = tdf[tdf['counts'] >= min_num_of_extractions]
        vals[l] = len(tdf)
    return vals

def max_and_min_num_extraction_each_language(df: pd.DataFrame):
    """Return the per-language minimum and maximum extraction counts."""
    langs = np.unique(df['language'].values)
    mins, maxs = {}, {}  # renamed to avoid shadowing the built-in min/max
    for l in langs:
        counts = df[df['language'] == l]['counts']
        maxs[l], mins[l] = counts.max(), counts.min()
    return mins, maxs

def get_avg_extractions(df: pd.DataFrame):
    """Return the mean extraction count per language."""
    langs = np.unique(df['language'].values)
    vals = {}
    for l in langs:
        vals[l] = df[df['language'] == l]['counts'].mean()
    return vals

#%%
if __name__ == '__main__':
    args = argparser()
    df = pd.read_csv(args.input_file)

    # Keep only keywords with at least 5 extracted clips.
    df = df[df.counts >= 5]
    #%%
    vals5 = get_number_of_keywords(df, 5)
    vals20 = get_number_of_keywords(df, 20)
    vals50 = get_number_of_keywords(df, 50)
    vals100 = get_number_of_keywords(df, 100)
    vals200 = get_number_of_keywords(df, 200)

    columns = ['Language', 'At least 5 extractions', 'At least 20 extractions',
               'At least 50 extractions', 'At least 100 extractions',
               'At least 200 extractions']
    values = [np.unique(df['language'].values).tolist(), list(vals5.values()),
              list(vals20.values()), list(vals50.values()),
              list(vals100.values()), list(vals200.values())]
    table = pd.DataFrame(dict(zip(columns, values)), index=vals5.keys())

    mins, maxs = max_and_min_num_extraction_each_language(df)
    avg = get_avg_extractions(df)

    table['Minimum'] = list(mins.values())
    table['Maximum'] = list(maxs.values())
    table['Average'] = list(avg.values())

    table.to_csv(args.output_file, index=False)
# %%
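An example invocation, a sketch reusing the paths from the script's commented-out defaults; the input CSV is assumed to have language and counts columns, one row per keyword:

python eda/get_data.py --input_file ../../data/csvs/new.csv --output_file ../../data/csvs/new2.csv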
137 changes: 137 additions & 0 deletions eda/plot.py
@@ -0,0 +1,137 @@
import pickle
from collections import Counter

import matplotlib.pyplot as plt
import pandas as pd
import plotly.express as px
from tqdm import tqdm

# Per-language Counters of {word: number of extracted clips},
# as produced by eda/frequent_wordcounts.py (file name as committed).
stats = pickle.load(open('frequent_words_stats_std2.p', 'rb'))


def plot_num_words(stats):
    """Bar chart: number of distinct frequent words per language."""
    wordcounts = {l: len(stats[l]) for l in stats}
    wordcounts = dict(sorted(wordcounts.items(), key=lambda item: item[1], reverse=True))
    fig, ax = plt.subplots(figsize=(30, 20))
    ax.bar(wordcounts.keys(), wordcounts.values())
    ax.tick_params(axis='x', labelrotation=70)
    ax.set_xlabel("Languages")
    ax.set_ylabel("Number of words")
    ax.set_title("Number of Words for various languages in Common Voice")
    fig.savefig('numwords.png')


def plot_avg_word_length(stats):
    """Bar chart: average word length per language."""
    lengths = {l: sum(len(w) for w in stats[l]) / len(stats[l]) for l in stats}
    fig, ax = plt.subplots(figsize=(30, 20))
    ax.bar(lengths.keys(), lengths.values())
    ax.tick_params(axis='x', labelrotation=70)
    ax.set_xlabel("Languages")
    ax.set_ylabel("Word Length")
    ax.set_title("Average Word Length for various languages in Common Voice")
    fig.savefig('wordlengths.png')


def plot_counts(counts, lang):
    """Line plot of the clip counts for one language's 500 most common words."""
    counts = Counter(counts)
    data = [(i, n) for i, (_, n) in enumerate(counts.most_common(500))]
    df = pd.DataFrame(data, columns=['index', 'counts'])
    df['language'] = lang
    return px.line(df, x='index', y='counts', color='language')


# Rank/frequency curves for every language in a single interactive figure.
data = []
for l in tqdm(stats):
    counts = Counter(stats[l]).most_common(500)
    data.extend((i, counts[i][1], l) for i in range(len(counts)))

df = pd.DataFrame(data, columns=['index', 'counts', 'language'])
fig = px.line(df, x='index', y='counts', color='language')
fig.write_html('line.html')


def wordlengths(main_data):
    """Per language, bar chart of the number of keywords at each word length."""
    languages = main_data['language'].unique()
    plot_saves = "plots/wordlengths/"
    for l in tqdm(languages):
        sub_df = main_data[main_data['language'] == l]
        words = sub_df['word'].unique().tolist()

        lengths = {}
        for w in words:
            if isinstance(w, str):
                lengths[len(w)] = lengths.get(len(w), 0) + 1

        fig, ax = plt.subplots(figsize=(8, 8))
        ax.bar(lengths.keys(), lengths.values())
        ax.set_xlabel("Word Length")
        ax.set_ylabel("Number of Keywords")
        ax.set_title(f"Number of Keywords v/s Word Length for {l}")
        fig.savefig(plot_saves + f"{l}.png")


def graph1(main_data):
    """Per language, bar chart of the number of keywords at each extraction count."""
    languages = main_data['language'].unique()
    plot_saves = "plots/wordlengths/"  # note: shares a directory (and file names) with wordlengths()
    for l in tqdm(languages):
        sub_df = main_data[main_data['language'] == l]
        plotdata = sub_df[['word', 'counts']].groupby('counts').count().reset_index()

        fig, ax = plt.subplots(figsize=(8, 8))
        ax.bar(plotdata.counts, plotdata.word)
        ax.set_xlabel("Number of Extractions")
        ax.set_ylabel("Number of Keywords")
        ax.set_title(f"Number of Keywords v/s Number of Extractions for {l}")
        fig.savefig(plot_saves + f"{l}.png")
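
wordlengths() and graph1() are defined but not invoked in this module; a sketch of how the DataFrame they expect might be assembled from the stats pickle loaded above (the flattening into language/word/counts rows is an assumption):

import pickle
import pandas as pd

# Flatten the {language: Counter} pickle into one row per (language, word).
stats = pickle.load(open('frequent_words_stats_std2.p', 'rb'))
rows = [(lang, word, n)
        for lang, counter in stats.items()
        for word, n in counter.items()]
main_data = pd.DataFrame(rows, columns=['language', 'word', 'counts'])

wordlengths(main_data)
graph1(main_data)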
1 change: 1 addition & 0 deletions eda/plot2.ipynb

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions eda/plot2_violin2.ipynb

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions eda/plot2_violins.ipynb

Large diffs are not rendered by default.
