tiyd-python-2015-01 · grissett83 · Feb 14, 2015 · Feb 14, 2015 · Feb 14, 2015 · Feb 14, 2015
diff --git a/ProgrammingLanguageClassification.ipynb b/ProgrammingLanguageClassification.ipynb
@@ -0,0 +1,300 @@
+{
+ "metadata": {
+  "name": "",
+  "signature": "sha256:61d5af4ccb86ac8537d317c915e8379d0ed4a12643ccdc5816b46ddc9097c3da"
+ },
+ "nbformat": 3,
+ "nbformat_minor": 0,
+ "worksheets": [
+  {
+   "cells": [
+    {
+     "cell_type": "code",
+     "collapsed": false,
+     "input": [
+      "import re\n",
+      "import numpy as np\n",
+      "from sklearn.metrics import (classification_report, f1_score, accuracy_score,\n",
+      "                             confusion_matrix)\n",
+      "import parser\n",
+      "import trainer\n",
+      "import predictor\n",
+      "from sklearn.ensemble import AdaBoostClassifier"
+     ],
+     "language": "python",
+     "metadata": {},
+     "outputs": [],
+     "prompt_number": 1
+    },
+    {
+     "cell_type": "heading",
+     "level": 2,
+     "metadata": {},
+     "source": [
+      "Programming Language Identification"
+     ]
+    },
+    {
+     "cell_type": "raw",
+     "metadata": {},
+     "source": [
+      "First, we need to create and train our language classifier.  This will also involve testing the classifier to measure its accuracy.  For this script we are using a Random Forest classifier provided by the sklearn toolkit."
+     ]
+    },
+    {
+     "cell_type": "code",
+     "collapsed": false,
+     "input": [
+      "data, results = trainer.create_training_data()"
+     ],
+     "language": "python",
+     "metadata": {},
+     "outputs": [],
+     "prompt_number": 2
+    },
+    {
+     "cell_type": "raw",
+     "metadata": {},
+     "source": [
+      "The create_training_data function reads the training_data folder, then parses and scores each source file for use with our classifier.  It also creates a list containing the correct answer for each element in the data array.\n",
+      "\n",
+      "Next we need to split our data into training and testing blocks."
+     ]
+    },
+    {
+     "cell_type": "code",
+     "collapsed": false,
+     "input": [
+      "train_data, test_data, train_results, test_results = trainer.split_data(data, results, 0.2)"
+     ],
+     "language": "python",
+     "metadata": {},
+     "outputs": [],
+     "prompt_number": 3
+    },
+    {
+     "cell_type": "raw",
+     "metadata": {},
+     "source": [
+      "Now that our data has been appropriately split, we need to use our training data to train our classifier."
+     ]
+    },
+    {
+     "cell_type": "code",
+     "collapsed": false,
+     "input": [
+      "trained_forest = trainer.train_learner(train_data, train_results)"
+     ],
+     "language": "python",
+     "metadata": {},
+     "outputs": [],
+     "prompt_number": 4
+    },
+    {
+     "cell_type": "raw",
+     "metadata": {},
+     "source": [
+      "Now that our random forest is trained, we need to run it against our test data to see how well it performs."
+     ]
+    },
+    {
+     "cell_type": "code",
+     "collapsed": false,
+     "input": [
+      "trainer.test_learner(trained_forest, test_data, test_results)"
+     ],
+     "language": "python",
+     "metadata": {},
+     "outputs": [
+      {
+       "output_type": "stream",
+       "stream": "stdout",
+       "text": [
+        "             precision    recall  f1-score   support\n",
+        "\n",
+        "    Clojure       1.00      1.00      1.00         6\n",
+        "    Haskell       1.00      0.50      0.67         2\n",
+        "       Java       1.00      1.00      1.00        39\n",
+        " JavaScript       1.00      1.00      1.00         9\n",
+        "      OCaml       1.00      1.00      1.00         4\n",
+        "        PHP       0.96      1.00      0.98        72\n",
+        "       Perl       1.00      0.91      0.95        23\n",
+        "     Python       0.94      1.00      0.97        15\n",
+        "       Ruby       1.00      0.93      0.96        29\n",
+        "      Scala       1.00      1.00      1.00        10\n",
+        "     Scheme       1.00      1.00      1.00         2\n",
+        "        TCL       0.83      1.00      0.91         5\n",
+        "\n",
+        "avg / total       0.98      0.98      0.98       216\n",
+        "\n",
+        "[[ 6  0  0  0  0  0  0  0  0  0  0  0]\n",
+        " [ 0  1  0  0  0  1  0  0  0  0  0  0]\n",
+        " [ 0  0 39  0  0  0  0  0  0  0  0  0]\n",
+        " [ 0  0  0  9  0  0  0  0  0  0  0  0]\n",
+        " [ 0  0  0  0  4  0  0  0  0  0  0  0]\n",
+        " [ 0  0  0  0  0 72  0  0  0  0  0  0]\n",
+        " [ 0  0  0  0  0  1 21  1  0  0  0  0]\n",
+        " [ 0  0  0  0  0  0  0 15  0  0  0  0]\n",
+        " [ 0  0  0  0  0  1  0  0 27  0  0  1]\n",
+        " [ 0  0  0  0  0  0  0  0  0 10  0  0]\n",
+        " [ 0  0  0  0  0  0  0  0  0  0  2  0]\n",
+        " [ 0  0  0  0  0  0  0  0  0  0  0  5]]\n",
+        "0.9761312978\n"
+       ]
+      }
+     ],
+     "prompt_number": 5
+    },
+    {
+     "cell_type": "raw",
+     "metadata": {},
+     "source": [
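+      "We are scoring about 0.976 (close to 98%) against our test data.  TCL has the lowest precision (0.83) and Haskell the lowest recall (0.50), but both languages have very few samples in the test set (5 and 2 files respectively), and the training set for TCL was very small.\n",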
+      "\n",
+      "After training and testing, the classifier was retrained using the entire data set and saved to disk for later use.  Next we will use our classifier to try to identify some other test data."
+     ]
+    },
+    {
+     "cell_type": "code",
+     "collapsed": false,
+     "input": [
+      "result_list = []\n",
+      "with open(\"test.csv\") as result:\n",
+      "    results = result.readlines()\n",
+      "    for item in results:\n",
+      "        result_list.append(re.findall(\"\\d+,(\\w+)\", item)[0])"
+     ],
+     "language": "python",
+     "metadata": {},
+     "outputs": [],
+     "prompt_number": 6
+    },
+    {
+     "cell_type": "code",
+     "collapsed": false,
+     "input": [
+      "result_list"
+     ],
+     "language": "python",
+     "metadata": {},
+     "outputs": [
+      {
+       "metadata": {},
+       "output_type": "pyout",
+       "prompt_number": 7,
+       "text": [
+        "['Clojure',\n",
+        " 'Clojure',\n",
+        " 'Clojure',\n",
+        " 'Clojure',\n",
+        " 'Python',\n",
+        " 'Python',\n",
+        " 'Python',\n",
+        " 'Python',\n",
+        " 'JavaScript',\n",
+        " 'JavaScript',\n",
+        " 'JavaScript',\n",
+        " 'JavaScript',\n",
+        " 'Ruby',\n",
+        " 'Ruby',\n",
+        " 'Ruby',\n",
+        " 'Haskell',\n",
+        " 'Haskell',\n",
+        " 'Haskell',\n",
+        " 'Scheme',\n",
+        " 'Scheme',\n",
+        " 'Scheme',\n",
+        " 'Java',\n",
+        " 'Java',\n",
+        " 'Scala',\n",
+        " 'Scala',\n",
+        " 'TCL',\n",
+        " 'TCL',\n",
+        " 'PHP',\n",
+        " 'PHP',\n",
+        " 'PHP',\n",
+        " 'OCaml',\n",
+        " 'OCaml']"
+       ]
+      }
+     ],
+     "prompt_number": 7
+    },
+    {
+     "cell_type": "code",
+     "collapsed": false,
+     "input": [
+      "predictions = []\n",
+      "classifier = predictor.load_classifier()\n",
+      "\n",
+      "for num in range(1, 33):\n",
+      "    data = predictor.prepare_file(\"test/{}\".format(num))\n",
+      "    predictions.append(predictor.test_file(classifier, data))\n",
+      "predictions = np.array(predictions)"
+     ],
+     "language": "python",
+     "metadata": {},
+     "outputs": [],
+     "prompt_number": 8
+    },
+    {
+     "cell_type": "code",
+     "collapsed": false,
+     "input": [
+      "print(classification_report(result_list, predictions))\n",
+      "print(confusion_matrix(result_list, predictions))\n",
+      "print(f1_score(result_list, predictions))"
+     ],
+     "language": "python",
+     "metadata": {},
+     "outputs": [
+      {
+       "output_type": "stream",
+       "stream": "stdout",
+       "text": [
+        "             precision    recall  f1-score   support\n",
+        "\n",
+        "    Clojure       0.80      1.00      0.89         4\n",
+        "    Haskell       1.00      1.00      1.00         3\n",
+        "       Java       1.00      1.00      1.00         2\n",
+        " JavaScript       1.00      0.75      0.86         4\n",
+        "      OCaml       1.00      0.50      0.67         2\n",
+        "        PHP       0.75      1.00      0.86         3\n",
+        "     Python       1.00      1.00      1.00         4\n",
+        "       Ruby       0.75      1.00      0.86         3\n",
+        "      Scala       1.00      1.00      1.00         2\n",
+        "     Scheme       1.00      1.00      1.00         3\n",
+        "        TCL       1.00      0.50      0.67         2\n",
+        "\n",
+        "avg / total       0.93      0.91      0.90        32\n",
+        "\n",
+        "[[4 0 0 0 0 0 0 0 0 0 0]\n",
+        " [0 3 0 0 0 0 0 0 0 0 0]\n",
+        " [0 0 2 0 0 0 0 0 0 0 0]\n",
+        " [1 0 0 3 0 0 0 0 0 0 0]\n",
+        " [0 0 0 0 1 0 0 1 0 0 0]\n",
+        " [0 0 0 0 0 3 0 0 0 0 0]\n",
+        " [0 0 0 0 0 0 4 0 0 0 0]\n",
+        " [0 0 0 0 0 0 0 3 0 0 0]\n",
+        " [0 0 0 0 0 0 0 0 2 0 0]\n",
+        " [0 0 0 0 0 0 0 0 0 3 0]\n",
+        " [0 0 0 0 0 1 0 0 0 0 1]]\n",
+        "0.899801587302\n"
+       ]
+      }
+     ],
+     "prompt_number": 9
+    },
+    {
+     "cell_type": "code",
+     "collapsed": false,
+     "input": [],
+     "language": "python",
+     "metadata": {},
+     "outputs": []
+    }
+   ],
+   "metadata": {}
+  }
+ ]
+}
diff --git a/__init__.py b/__init__.py