diff --git a/Muhammad_Shazil_Lhr (1).ipynb b/Muhammad_Shazil_Lhr (1).ipynb new file mode 100644 index 0000000..c74ea7d --- /dev/null +++ b/Muhammad_Shazil_Lhr (1).ipynb @@ -0,0 +1,443 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Hello World\n" + ] + } + ], + "source": [ + "print \"Hello World\"" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "import numpy as np" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "import pandas as pd" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "df=pd.read_csv('/resources/data/chronic_kidney_disease_updated.csv')" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " age bp sg al su rbc pc pcc ba \\\n", + "0 NaN NaN NaN NaN NaN NaN NaN NaN NaN \n", + "1 48 80 1.020 1 0 ? normal notpresent notpresent \n", + "2 7 50 1.020 4 0 ? normal notpresent notpresent \n", + "3 62 80 1.010 2 3 normal normal notpresent notpresent \n", + "4 48 70 1.005 4 0 normal abnormal present notpresent \n", + "5 51 80 1.010 2 0 normal normal notpresent notpresent \n", + "6 60 90 1.015 3 0 ? ? notpresent notpresent \n", + "7 68 70 1.010 0 0 ? normal notpresent notpresent \n", + "8 24 ? 1.015 2 4 normal abnormal notpresent notpresent \n", + "9 52 100 1.015 3 0 normal abnormal present notpresent \n", + "10 53 90 1.020 2 0 abnormal abnormal present notpresent \n", + "11 50 60 1.010 2 4 ? abnormal present notpresent \n", + "12 63 70 1.010 3 0 abnormal abnormal present notpresent \n", + "13 68 70 1.015 3 1 ? normal present notpresent \n", + "14 68 70 ? ? ? ? ? notpresent notpresent \n", + "15 68 80 1.010 3 2 normal abnormal present present \n", + "16 40 80 1.015 3 0 ? normal notpresent notpresent \n", + "17 47 70 1.015 2 0 ? normal notpresent notpresent \n", + "18 47 80 ? ? ? ? ? notpresent notpresent \n", + "19 60 100 1.025 0 3 ? normal notpresent notpresent \n", + "20 62 60 1.015 1 0 ? abnormal present notpresent \n", + "21 61 80 1.015 2 0 abnormal abnormal notpresent notpresent \n", + "22 60 90 ? ? ? ? ? notpresent notpresent \n", + "23 48 80 1.025 4 0 normal abnormal notpresent notpresent \n", + "24 21 70 1.010 0 0 ? normal notpresent notpresent \n", + "25 42 100 1.015 4 0 normal abnormal notpresent present \n", + "26 61 60 1.025 0 0 ? normal notpresent notpresent \n", + "27 75 80 1.015 0 0 ? normal notpresent notpresent \n", + "28 69 70 1.010 3 4 normal abnormal notpresent notpresent \n", + "29 75 70 ? 1 3 ? ? notpresent notpresent \n", + ".. ... ... ... ... ... ... ... ... ... \n", + "371 69 70 1.020 0 0 normal normal notpresent notpresent \n", + "372 28 60 1.025 0 0 normal normal notpresent notpresent \n", + "373 72 60 1.020 0 0 normal normal notpresent notpresent \n", + "374 61 70 1.025 0 0 normal normal notpresent notpresent \n", + "375 79 80 1.025 0 0 normal normal notpresent notpresent \n", + "376 70 80 1.020 0 0 normal normal notpresent notpresent \n", + "377 58 70 1.025 0 0 normal normal notpresent notpresent \n", + "378 64 70 1.020 0 0 normal normal notpresent notpresent \n", + "379 71 60 1.025 0 0 normal normal notpresent notpresent \n", + "380 62 80 1.025 0 0 normal normal notpresent notpresent \n", + "381 59 60 1.020 0 0 normal normal notpresent notpresent \n", + "382 71 70 1.025 0 0 ? ? notpresent notpresent \n", + "383 48 80 1.025 0 0 normal normal notpresent notpresent \n", + "384 80 80 1.025 0 0 normal normal notpresent notpresent \n", + "385 57 60 1.020 0 0 normal normal notpresent notpresent \n", + "386 63 70 1.020 0 0 normal normal notpresent notpresent \n", + "387 46 70 1.025 0 0 normal normal notpresent notpresent \n", + "388 15 80 1.025 0 0 normal normal notpresent notpresent \n", + "389 51 80 1.020 0 0 normal normal notpresent notpresent \n", + "390 41 80 1.025 0 0 normal normal notpresent notpresent \n", + "391 52 80 1.025 0 0 normal normal notpresent notpresent \n", + "392 36 80 1.025 0 0 normal normal notpresent notpresent \n", + "393 57 80 1.020 0 0 normal normal notpresent notpresent \n", + "394 43 60 1.025 0 0 normal normal notpresent notpresent \n", + "395 50 80 1.020 0 0 normal normal notpresent notpresent \n", + "396 55 80 1.020 0 0 normal normal notpresent notpresent \n", + "397 42 70 1.025 0 0 normal normal notpresent notpresent \n", + "398 12 80 1.020 0 0 normal normal notpresent notpresent \n", + "399 17 60 1.025 0 0 normal normal notpresent notpresent \n", + "400 58 80 1.025 0 0 normal normal notpresent notpresent \n", + "\n", + " bgr ... pcv wbcc rbcc htn dm cad appet pe ane class \n", + "0 NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN \n", + "1 121 ... 44 7800 5.2 yes yes no good no no ckd \n", + "2 ? ... 38 6000 ? no no no good no no ckd \n", + "3 423 ... 31 7500 ? no yes no poor no yes ckd \n", + "4 117 ... 32 6700 3.9 yes no no poor yes yes ckd \n", + "5 106 ... 35 7300 4.6 no no no good no no ckd \n", + "6 74 ... 39 7800 4.4 yes yes no good yes no ckd \n", + "7 100 ... 36 ? ? no no no good no no ckd \n", + "8 410 ... 44 6900 5 no yes no good yes no ckd \n", + "9 138 ... 33 9600 4.0 yes yes no good no yes ckd \n", + "10 70 ... 29 12100 3.7 yes yes no poor no yes ckd \n", + "11 490 ... 28 ? ? yes yes no good no yes ckd \n", + "12 380 ... 32 4500 3.8 yes yes no poor yes no ckd \n", + "13 208 ... 28 12200 3.4 yes yes yes poor yes no ckd \n", + "14 98 ... ? ? ? yes yes yes poor yes no ckd \n", + "15 157 ... 16 11000 2.6 yes yes yes poor yes no ckd \n", + "16 76 ... 24 3800 2.8 yes no no good no yes ckd \n", + "17 99 ... ? ? ? no no no good no no ckd \n", + "18 114 ... ? ? ? yes no no poor no no ckd \n", + "19 263 ... 37 11400 4.3 yes yes yes good no no ckd \n", + "20 100 ... 30 5300 3.7 yes no yes good no no ckd \n", + "21 173 ... 24 9200 3.2 yes yes yes poor yes yes ckd \n", + "22 ? ... 32 6200 3.6 yes yes yes good no no ckd \n", + "23 95 ... 32 6900 3.4 yes no no good no yes ckd \n", + "24 ? ... ? ? ? no no no poor no yes ckd \n", + "25 ? ... 39 8300 4.6 yes no no poor no no ckd \n", + "26 108 ... 29 8400 3.7 yes yes no good no yes ckd \n", + "27 156 ... 35 10300 4 yes yes no poor no no ckd \n", + "28 264 ... 37 9600 4.1 yes yes yes good yes no ckd \n", + "29 123 ... ? ? ? no yes no good no no ckd \n", + ".. ... ... ... ... ... ... ... ... ... ... ... ... \n", + "371 83 ... 50 9300 5.4 no no no good no no notckd \n", + "372 79 ... 51 6500 5.0 no no no good no no notckd \n", + "373 109 ... 52 10500 5.5 no no no good no no notckd \n", + "374 133 ... 47 9200 4.9 no no no good no no notckd \n", + "375 111 ... 40 8000 6.4 no no no good no no notckd \n", + "376 74 ... 48 9700 5.6 no no no good no no notckd \n", + "377 88 ... 53 9100 5.2 no no no good no no notckd \n", + "378 97 ... 49 6400 4.8 no no no good no no notckd \n", + "379 ? ... 42 7700 5.5 no no no good no no notckd \n", + "380 78 ... 50 5400 5.7 no no no good no no notckd \n", + "381 113 ... 54 6500 4.9 no no no good no no notckd \n", + "382 79 ... 40 5800 5.9 no no no good no no notckd \n", + "383 75 ... 51 6000 6.5 no no no good no no notckd \n", + "384 119 ... 49 5100 5.0 no no no good no no notckd \n", + "385 132 ... 42 11000 4.5 no no no good no no notckd \n", + "386 113 ... 52 8000 5.1 no no no good no no notckd \n", + "387 100 ... 43 5700 6.5 no no no good no no notckd \n", + "388 93 ... 50 6200 5.2 no no no good no no notckd \n", + "389 94 ... 46 9500 6.4 no no no good no no notckd \n", + "390 112 ... 52 7200 5.8 no no no good no no notckd \n", + "391 99 ... 52 6300 5.3 no no no good no no notckd \n", + "392 85 ... 44 5800 6.3 no no no good no no notckd \n", + "393 133 ... 46 6600 5.5 no no no good no no notckd \n", + "394 117 ... 54 7400 5.4 no no no good no no notckd \n", + "395 137 ... 45 9500 4.6 no no no good no no notckd \n", + "396 140 ... 47 6700 4.9 no no no good no no notckd \n", + "397 75 ... 54 7800 6.2 no no no good no no notckd \n", + "398 100 ... 49 6600 5.4 no no no good no no notckd \n", + "399 114 ... 51 7200 5.9 no no no good no no notckd \n", + "400 131 ... 53 6800 6.1 no no no good no no notckd \n", + "\n", + "[401 rows x 25 columns]\n" + ] + } + ], + "source": [ + "print df" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "['age', 'bp', 'sg', 'al', 'su', 'rbc', 'pc', 'pcc', 'ba', 'bgr', 'bu', 'sc', 'sod', 'pot', 'hemo', 'pcv', 'wbcc', 'rbcc', 'htn', 'dm', 'cad', 'appet', 'pe', 'ane', 'class']\n" + ] + } + ], + "source": [ + "print df.columns.values.tolist()" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[nan 'yes' 'no' ' yes' '\\tno' '\\tyes' '?']\n" + ] + } + ], + "source": [ + "print df.dm.unique()" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[nan 'yes' 'no' ' yes' '?']\n" + ] + } + ], + "source": [ + "df['dm'].replace(regex=True,inplace=True,to_replace=r'\\t',value=r'')\n", + "df.replace('0',np.nan)\n", + "print df.dm.unique()" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " age bp sg al su rbc pc pcc ba bgr \\\n", + "0 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN \n", + "1 48 80 1.020 1 0 NaN normal notpresent notpresent 121 \n", + "2 7 50 1.020 4 0 NaN normal notpresent notpresent NaN \n", + "3 62 80 1.010 2 3 normal normal notpresent notpresent 423 \n", + "4 48 70 1.005 4 0 normal abnormal present notpresent 117 \n", + "\n", + " ... pcv wbcc rbcc htn dm cad appet pe ane class \n", + "0 ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN \n", + "1 ... 44 7800 5.2 yes yes no good no no ckd \n", + "2 ... 38 6000 NaN no no no good no no ckd \n", + "3 ... 31 7500 NaN no yes no poor no yes ckd \n", + "4 ... 32 6700 3.9 yes no no poor yes yes ckd \n", + "\n", + "[5 rows x 25 columns]\n" + ] + } + ], + "source": [ + "df=df.replace(r'\\?+',np.nan,regex=True)\n", + "print df.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[nan 'yes' 'no' ' yes']\n" + ] + } + ], + "source": [ + "print df.dm.unique()" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "age float64\n", + "bp float64\n", + "sg object\n", + "al object\n", + "su object\n", + "rbc object\n", + "pc object\n", + "pcc object\n", + "ba object\n", + "bgr object\n", + "bu object\n", + "sc object\n", + "sod object\n", + "pot object\n", + "hemo object\n", + "pcv object\n", + "wbcc object\n", + "rbcc object\n", + "htn object\n", + "dm object\n", + "cad object\n", + "appet object\n", + "pe object\n", + "ane object\n", + "class object\n", + "dtype: object\n" + ] + } + ], + "source": [ + "df[['age','bp']]=df[['age','bp']].apply(pd.to_numeric)\n", + "print df.dtypes" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "normal 201\n", + "abnormal 47\n", + "Name: rbc, dtype: int64\n" + ] + } + ], + "source": [ + "print df['rbc'].value_counts()" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "180.0\n" + ] + } + ], + "source": [ + "print df['bp'].max()" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "df.to_csv(\"/resources/data/chronic_kidney_disease_answered.csv\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 2", + "language": "python", + "name": "python2" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 2 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython2", + "version": "2.7.12" + }, + "widgets": { + "state": {}, + "version": "1.1.2" + } + }, + "nbformat": 4, + "nbformat_minor": 0 +}