rust-ndarray · bytesnake · Oct 20, 2019 · Oct 20, 2019 · Oct 20, 2019 · Oct 20, 2019
diff --git a/Cargo.toml b/Cargo.toml
@@ -3,4 +3,5 @@ members = [
   "./",
   "linear_regression",
   "k_means",
-]
+  "svm"
+]
diff --git a/svm/Cargo.toml b/svm/Cargo.toml
@@ -0,0 +1,13 @@
+[package]
+name = "svm"
+version = "0.1.0"
+authors = ["Lorenz Schmidt <[email protected]>"]
+edition = "2018"
+
+# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
+
+[dependencies]
+ndarray = {version = "0.13"}
+ndarray-stats = "0.3"
+ndarray-rand = "0.11"
+rand = "0.7"
diff --git a/svm/src/lib.rs b/svm/src/lib.rs
@@ -0,0 +1,76 @@
+#![allow(non_snake_case)]
+use ndarray::{Array, Array1, ArrayBase, Data, Ix2};
+use ndarray_stats::DeviationExt;
+use std::iter::FromIterator;
+
+pub mod scores;
+
+/// Linear support vector machine for binary classification.
+///
+/// Both fields start out as `None` and are filled in by `fit`.
+#[derive(Debug, Clone, Default)]
+pub struct SupportVectorMachine {
+    /// Normal vector of the separating hyperplane, one weight per feature.
+    normal: Option<Array1<f64>>,
+    /// Offset added to `X · normal` when predicting.
+    bias: Option<f64>,
+}
+
+impl SupportVectorMachine {
+    /// Step size of the projected gradient ascent on the dual multipliers.
+    const LEARNING_RATE: f64 = 1e-12;
+
+    /// Iteration stops once the squared L2 distance between two successive
+    /// multiplier vectors drops below this value.
+    const TOLERANCE: f64 = 1e-14;
+
+    /// Creates an unfitted model; call `fit` before `predict`.
+    pub fn new() -> SupportVectorMachine {
+        SupportVectorMachine {
+            normal: None,
+            bias: None,
+        }
+    }
+
+    /// Fits the separating hyperplane to the samples in `X`.
+    ///
+    /// `X` holds one sample per row and `y_bool` one label per sample, where
+    /// `true` is mapped to `+1` and `false` to `-1`. The multipliers are
+    /// optimised by gradient ascent with clamping to non-negative values; the
+    /// resulting normal vector and bias are stored in `self`.
+    ///
+    /// If one of the two classes is absent from `y_bool`, the stored bias is NaN.
+    ///
+    /// # Panics
+    ///
+    /// Panics if the number of rows of `X` differs from `y_bool.len()`.
+    pub fn fit<A>(&mut self, X: &ArrayBase<A, Ix2>, y_bool: &[bool])
+        where A: Data<Elem = f64>,
+    {
+        let (n_samples, _) = X.dim();
+
+        assert!(
+            n_samples == y_bool.len(),
+            "We need the same number of samples as well as targets!"
+        );
+
+        let y: Array1<f64> =
+            Array::from_iter(y_bool.iter().map(|&label| if label { 1.0 } else { -1.0 }));
+
+        let mut multiplier: Array1<f64> = Array::ones(n_samples);
+
+        loop {
+            // gamma_i = y_i * x_i · Σ_j y_j α_j x_j, so `1 - gamma` is the
+            // gradient of the dual objective with respect to the multipliers.
+            let tmp = (&y * &multiplier).dot(X);
+            let gamma = &X.dot(&tmp) * &y;
+
+            let update = Self::LEARNING_RATE * (1.0 - gamma);
+
+            // Project back onto the feasible set: multipliers must stay >= 0.
+            let mut new_multiplier = &multiplier + &update;
+            new_multiplier.mapv_inplace(|x| f64::max(0.0, x));
+
+            let distance = multiplier
+                .sq_l2_dist(&new_multiplier)
+                .expect("multiplier vectors always have the same length");
+            multiplier = new_multiplier;
+
+            if distance < Self::TOLERANCE {
+                break;
+            }
+        }
+
+        let normal = (&y * &multiplier).dot(X);
+        let z = X.dot(&normal);
+
+        // Closest projection of each class onto the normal: the smallest one
+        // among the positive samples and the largest among the negative ones.
+        // Folding from NaN works because `f64::min`/`f64::max` ignore a NaN operand.
+        let min = z.iter().zip(y_bool.iter())
+            .filter(|(_, label)| **label)
+            .map(|(score, _)| *score)
+            .fold(std::f64::NAN, f64::min);
+        let max = z.iter().zip(y_bool.iter())
+            .filter(|(_, label)| !**label)
+            .map(|(score, _)| *score)
+            .fold(std::f64::NAN, f64::max);
+
+        // The decision boundary `x · normal + bias = 0` has to lie midway
+        // between those two projections, hence the negated midpoint.
+        let bias = -(min + max) / 2.0;
+
+        self.normal = Some(normal);
+        self.bias = Some(bias);
+    }
+
+    /// Returns the decision value `x · normal + bias` for every row of `X`.
+    ///
+    /// Values above zero correspond to the `true` class used in `fit`. An
+    /// unfitted model returns a zero for each sample.
+    pub fn predict<A>(&self, X: &ArrayBase<A, Ix2>) -> Array1<f64>
+        where A: Data<Elem = f64>
+    {
+        match (&self.normal, &self.bias) {
+            (Some(normal), Some(bias)) => X.dot(normal) + *bias,
+            // One entry per sample (row), matching the fitted case.
+            _ => Array::zeros(X.dim().0),
+        }
+    }
+}
diff --git a/svm/src/main.rs b/svm/src/main.rs
@@ -0,0 +1,54 @@
+#![allow(non_snake_case)]
+use std::path::Path;
+use std::fs::File;
+use std::io::{BufReader, BufRead};
+
+use ndarray::{Array, Array2, Axis};
+use ndarray_stats::QuantileExt;
+
+use svm::{SupportVectorMachine, scores};
+
+/// Reads a comma-separated dataset and returns min-max scaled features with labels.
+///
+/// For every line the first field is skipped, the second field is the label
+/// (`true` when it equals `"M"`) and up to the next 20 fields are parsed as
+/// `f64` features. Each feature column is then rescaled to `[0, 1]`; a constant
+/// column is only shifted to zero, because dividing by its zero range would
+/// fill it with NaN.
+///
+/// # Panics
+///
+/// Panics if the file cannot be opened or read, a line has no label field, a
+/// feature fails to parse, or the minimum/maximum of a column cannot be
+/// determined (`min`/`max` return an error).
+fn dataset<T: AsRef<Path> + Copy>(path: T) -> (Array2<f64>, Vec<bool>) {
+    const N_FEATURES: usize = 20;
+
+    // Read the file once; the line count is needed to size the feature matrix.
+    let lines: Vec<String> = BufReader::new(File::open(path).unwrap())
+        .lines()
+        .map(|line| line.unwrap())
+        .collect();
+
+    let mut x = Array::zeros((lines.len(), N_FEATURES));
+    let mut y = Vec::with_capacity(lines.len());
+
+    for (row, line) in lines.iter().enumerate() {
+        let mut fields = line.split(',').skip(1);
+        y.push(fields.next().unwrap() == "M");
+
+        let features = fields.map(|field| field.parse::<f64>().unwrap()).take(N_FEATURES);
+        for (col, value) in features.enumerate() {
+            x[(row, col)] = value;
+        }
+    }
+
+    for mut col in x.axis_iter_mut(Axis(1)) {
+        let min = *col.min().unwrap();
+        let max = *col.max().unwrap();
+        let range = max - min;
+
+        col -= min;
+        // A constant column has zero range; leave it at zero instead of 0/0.
+        if range > 0.0 {
+            col /= range;
+        }
+    }
+
+    (x, y)
+}
+
+/// Trains the SVM on the first 90% of the dataset and prints the scores
+/// obtained on the remaining rows.
+fn main() {
+    let (X, y) = dataset("./src/wdbc.data");
+
+    // Row index separating the training part from the held-out part.
+    let n_rows = X.len_of(Axis(0));
+    let split_idx = (n_rows as f64 * 0.9).floor() as usize;
+
+    let (training_x, testing_x) = X.view().split_at(Axis(0), split_idx);
+    let (training_y, testing_y) = y.split_at(split_idx);
+
+    let mut svm = SupportVectorMachine::new();
+    svm.fit(&training_x, training_y);
+
+    // Evaluate the fitted model on the held-out rows.
+    let prediction = svm.predict(&testing_x);
+
+    let accuracy = scores::accuracy(&prediction, testing_y);
+    let precision = scores::precision(&prediction, testing_y);
+    let recall = scores::recall(&prediction, testing_y);
+    let f1 = scores::f1_score(&prediction, testing_y);
+
+    println!("Accuracy {}, Precision {}, Recall {}, F1 score {}", accuracy, precision, recall, f1);
+}
diff --git a/svm/src/scores.rs b/svm/src/scores.rs
@@ -0,0 +1,45 @@
+use ndarray::prelude::*;
+use ndarray::Data;
+
+/// Computes the precision: the share of positive predictions that are truly positive.
+///
+/// A prediction in `x` counts as positive when it is strictly greater than
+/// zero; `y` holds the true labels. Pairs are matched by position.
+///
+/// Returns `0.0` when nothing was predicted positive, rather than the NaN
+/// that `0 / 0` would produce.
+pub fn precision<D>(x: &ArrayBase<D, Ix1>, y: &[bool]) -> f64
+    where D: Data<Elem = f64> {
+    let num_positive = x.iter().filter(|&&a| a > 0.0).count();
+    if num_positive == 0 {
+        return 0.0;
+    }
+
+    let num_true_positives = x.iter().zip(y.iter())
+        .filter(|(a, b)| **a > 0.0 && **b)
+        .count();
+
+    num_true_positives as f64 / num_positive as f64
+}
+
+/// Computes the accuracy: correctly classified pairs divided by `y.len()`.
+///
+/// A pair is correct when the label is `true` and the prediction is greater
+/// than zero, or the label is `false` and the prediction is at most zero.
+pub fn accuracy<D>(x: &ArrayBase<D, Ix1>, y: &[bool]) -> f64
+    where D: Data<Elem = f64> {
+
+    let mut hits = 0usize;
+    for (score, &label) in x.iter().zip(y.iter()) {
+        // Two explicit comparisons so that a NaN score matches neither class.
+        let correct = if label { *score > 0.0 } else { *score <= 0.0 };
+        if correct {
+            hits += 1;
+        }
+    }
+
+    hits as f64 / y.len() as f64
+}
+
+/// Computes the recall: the share of truly positive samples that were predicted positive.
+///
+/// A prediction in `x` counts as positive when it is strictly greater than
+/// zero; `y` holds the true labels. Pairs are matched by position.
+///
+/// Returns `0.0` when `y` contains no positive label, rather than the NaN
+/// that `0 / 0` would produce.
+pub fn recall<D>(x: &ArrayBase<D, Ix1>, y: &[bool]) -> f64
+    where D: Data<Elem = f64>
+{
+    let total_number_positives = y.iter().filter(|&&label| label).count();
+    if total_number_positives == 0 {
+        return 0.0;
+    }
+
+    let num_true_positives = x.iter().zip(y.iter())
+        .filter(|(a, b)| **a > 0.0 && **b)
+        .count();
+
+    num_true_positives as f64 / total_number_positives as f64
+}
+
+/// Computes the F1 score, the harmonic mean of `recall` and `precision`.
+///
+/// Returns `0.0` when recall and precision are both zero, rather than the
+/// NaN that `0 / 0` would produce.
+pub fn f1_score<D>(x: &ArrayBase<D, Ix1>, y: &[bool]) -> f64
+    where D: Data<Elem = f64>
+{
+    let r = recall(x, y);
+    let p = precision(x, y);
+
+    let denominator = r + p;
+    if denominator == 0.0 {
+        return 0.0;
+    }
+
+    2.0 * (r * p) / denominator
+}