Skip to content

Commit

Permalink
Adicionar notebook, relatorio e csv do ano de 2012
Browse files Browse the repository at this point in the history
  • Loading branch information
CarolinaTozzi committed Oct 16, 2018
1 parent 0df80f9 commit c0de8f1
Show file tree
Hide file tree
Showing 3 changed files with 103,426 additions and 0 deletions.
221 changes: 221 additions & 0 deletions inep-curso-superior-2012-dask.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,221 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"9565483\n"
]
}
],
"source": [
"import dask\n",
"import dask.dataframe as dd\n",
"\n",
"\n",
"# Columns of DM_ALUNO.CSV kept for this analysis (each name listed once).\n",
"SELECTED_COLUMNS = [\n",
"    'CO_COR_RACA_ALUNO',\n",
"    'IN_SEXO_ALUNO',\n",
"    'NU_ANO_ALUNO_NASC',\n",
"    'IN_RESERVA_VAGAS',\n",
"    'IN_FINANC_ESTUDANTIL',\n",
"    'IN_RESERVA_ETNICO',\n",
"    'IN_ING_VESTIBULAR',\n",
"    'IN_ING_ENEM',\n",
"    'IN_ING_OUTRO_TIPO_SELECAO',\n",
"    'IN_ING_CONVENIO_PECG',\n",
"    'IN_ING_OUTRA_FORMA',\n",
"    'IN_RESERVA_DEFICIENCIA',\n",
"    'IN_RESERVA_ENSINO_PUBLICO',\n",
"    'IN_RESERVA_RENDA_FAMILIAR',\n",
"    'IN_RESERVA_OUTROS',\n",
"    'IN_FIN_REEMB_FIES',\n",
"    'IN_FIN_REEMB_ESTADUAL',\n",
"    'IN_FIN_REEMB_MUNICIPAL',\n",
"    'IN_FIN_REEMB_PROG_IES',\n",
"    'IN_FIN_REEMB_ENT_EXTERNA',\n",
"    'IN_FIN_REEMB_OUTRA',\n",
"    'IN_FIN_NAOREEMB_PROUNI_INTEGR',\n",
"    'IN_FIN_NAOREEMB_PROUNI_PARCIAL',\n",
"    'IN_FIN_NAOREEMB_ESTADUAL',\n",
"    'IN_FIN_NAOREEMB_MUNICIPAL',\n",
"    'IN_FIN_NAOREEMB_PROG_IES',\n",
"    'IN_FIN_NAOREEMB_ENT_EXTERNA',\n",
"    'ANO_INGRESSO',\n",
"]\n",
"assert len(set(SELECTED_COLUMNS)) == len(SELECTED_COLUMNS), 'duplicate column name'\n",
"\n",
"# Read the pipe-delimited, ISO-8859-1 encoded file into a dask dataframe,\n",
"# keeping only the selected columns. assume_missing=True asks dask to read\n",
"# integer columns as floats so that missing values can be represented.\n",
"dataframe = dd.read_csv(\n",
"    'DM_ALUNO.CSV',\n",
"    delimiter='|',\n",
"    encoding='ISO-8859-1',\n",
"    assume_missing=True,\n",
"    usecols=SELECTED_COLUMNS,\n",
")\n",
"\n",
"# print(len(dataframe))"
]
},
{
"cell_type": "code",
"execution_count": 21,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"95658\n"
]
}
],
"source": [
"# Draw a random sample of the original dataframe.\n",
"# A fixed random_state makes the selection repeatable across fresh kernel runs.\n",
"SAMPLE_FRACTION = 0.01  # fraction of rows to keep\n",
"SAMPLE_SEED = 42\n",
"sample = dataframe.sample(frac=SAMPLE_FRACTION, random_state=SAMPLE_SEED)\n",
"# print(len(sample))"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" CO_COR_RACA_ALUNO IN_SEXO_ALUNO NU_ANO_ALUNO_NASC IN_RESERVA_VAGAS \\\n",
"50527 6.0 1.0 1972.0 0.0 \n",
"22416 0.0 1.0 1994.0 0.0 \n",
"54050 1.0 0.0 1982.0 0.0 \n",
"37050 0.0 1.0 1986.0 0.0 \n",
"76654 1.0 0.0 1987.0 0.0 \n",
"\n",
" IN_FINANC_ESTUDANTIL IN_ING_VESTIBULAR IN_ING_ENEM \\\n",
"50527 0.0 1.0 0.0 \n",
"22416 1.0 1.0 1.0 \n",
"54050 0.0 1.0 0.0 \n",
"37050 1.0 1.0 0.0 \n",
"76654 0.0 1.0 0.0 \n",
"\n",
" IN_ING_OUTRO_TIPO_SELECAO IN_ING_CONVENIO_PECG IN_ING_OUTRA_FORMA \\\n",
"50527 0.0 0.0 0.0 \n",
"22416 0.0 0.0 0.0 \n",
"54050 0.0 0.0 0.0 \n",
"37050 0.0 0.0 0.0 \n",
"76654 0.0 NaN 0.0 \n",
"\n",
" ... IN_FIN_REEMB_PROG_IES IN_FIN_REEMB_ENT_EXTERNA \\\n",
"50527 ... NaN NaN \n",
"22416 ... 0.0 0.0 \n",
"54050 ... NaN NaN \n",
"37050 ... 0.0 0.0 \n",
"76654 ... NaN NaN \n",
"\n",
" IN_FIN_REEMB_OUTRA IN_FIN_NAOREEMB_PROUNI_INTEGR \\\n",
"50527 NaN NaN \n",
"22416 0.0 0.0 \n",
"54050 NaN NaN \n",
"37050 0.0 0.0 \n",
"76654 NaN NaN \n",
"\n",
" IN_FIN_NAOREEMB_PROUNI_PARCIAL IN_FIN_NAOREEMB_ESTADUAL \\\n",
"50527 NaN NaN \n",
"22416 0.0 0.0 \n",
"54050 NaN NaN \n",
"37050 0.0 0.0 \n",
"76654 NaN NaN \n",
"\n",
" IN_FIN_NAOREEMB_MUNICIPAL IN_FIN_NAOREEMB_PROG_IES \\\n",
"50527 NaN NaN \n",
"22416 1.0 1.0 \n",
"54050 NaN NaN \n",
"37050 0.0 1.0 \n",
"76654 NaN NaN \n",
"\n",
" IN_FIN_NAOREEMB_ENT_EXTERNA ANO_INGRESSO \n",
"50527 NaN 2012.0 \n",
"22416 0.0 2012.0 \n",
"54050 NaN 2011.0 \n",
"37050 0.0 2008.0 \n",
"76654 NaN 2010.0 \n",
"\n",
"[5 rows x 28 columns]\n"
]
}
],
"source": [
"# Preview the first five rows of the sample as plain text\n",
"print(sample.head(n=5))"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {},
"outputs": [],
"source": [
"import pandas_profiling as pf"
]
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {},
"outputs": [],
"source": [
"# Compute the sample and build its profiling report\n",
"report_2012 = pf.ProfileReport(sample.compute())"
]
},
{
"cell_type": "code",
"execution_count": 20,
"metadata": {},
"outputs": [],
"source": [
"# Write the profiling report to an HTML file.\n",
"# NOTE(review): newer pandas-profiling releases appear to name this keyword\n",
"# `output_file` -- confirm against the installed version.\n",
"report_2012.to_file(outputfile='report_2012.html')"
]
},
{
"cell_type": "code",
"execution_count": 24,
"metadata": {},
"outputs": [],
"source": [
"# Compute the sample and export it to CSV with pandas' to_csv defaults.\n",
"# If a custom separator is ever needed, to_csv takes `sep`, not `delimiter`.\n",
"sample.compute().to_csv('sample_2012.csv')"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.0"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
Loading

0 comments on commit c0de8f1

Please sign in to comment.