From 02c455d60c473a8abd460fb83da354a9d0f8a461 Mon Sep 17 00:00:00 2001
From: Ming-Yang Liu <eric612kimo@yahoo.com.tw>
Date: Tue, 12 Mar 2019 15:06:24 +0800
Subject: [PATCH] Add evaluate function

---
 examples/yolo/yolo_detect.cpp                 |   2 +-
 include/caffe/layers/annotated_data_layer.hpp |   3 +
 .../layers/segmentation_evaluate_layer.hpp    |  77 +++++++
 include/caffe/solver.hpp                      |   1 +
 .../mobilenet_yolov3_solver.prototxt          |   8 +-
 .../cityscapes/mobilenet_yolov3_test.prototxt | 214 ++++++++++++++++--
 src/caffe/layers/annotated_data_layer.cpp     |  36 ++-
 .../layers/segmentation_evaluate_layer.cpp    | 180 +++++++++++++++
 src/caffe/layers/yolo_seg.cpp                 |  11 +-
 src/caffe/proto/caffe.proto                   |   5 +
 src/caffe/solver.cpp                          | 151 +++++++++++-
 11 files changed, 655 insertions(+), 33 deletions(-)
 create mode 100644 include/caffe/layers/segmentation_evaluate_layer.hpp
 create mode 100644 src/caffe/layers/segmentation_evaluate_layer.cpp
diff --git a/examples/yolo/yolo_detect.cpp b/examples/yolo/yolo_detect.cpp
index 5267062..a43ec6a 100644
--- a/examples/yolo/yolo_detect.cpp
+++ b/examples/yolo/yolo_detect.cpp
@@ -482,7 +482,7 @@ void MatMul(cv::Mat img1, cv::Mat img2,int idx=0)
 		int img_index1 = 0;
 		int img_index2 = 0;
 		for (j = 0; j < width; j++) {
-      if(ptr2[img_index2]>20) {
+      if(ptr2[img_index2]>120) {
         ptr1[img_index1+idx] = 127 + ptr1[img_index1]/2;
       }
 			//ptr1[img_index1+idx] = (unsigned char) BOUND(ptr1[img_index1] + ptr2[img_index2] * 1.0,0,255);
diff --git a/include/caffe/layers/annotated_data_layer.hpp b/include/caffe/layers/annotated_data_layer.hpp
index 7e9c866..c181b1d 100644
--- a/include/caffe/layers/annotated_data_layer.hpp
+++ b/include/caffe/layers/annotated_data_layer.hpp
@@ -44,6 +44,9 @@ class AnnotatedDataLayer : public BasePrefetchingDataLayer<Dtype> {
   bool single_class_; //for yolo segementation
   YoloSegLabel label_map_;
   int seg_label_maxima_;
+  int seg_scales_;
+  int seg_resize_width_;
+  int seg_resize_height_;
 };
 
 }  // namespace caffe
diff --git a/include/caffe/layers/segmentation_evaluate_layer.hpp b/include/caffe/layers/segmentation_evaluate_layer.hpp
new file mode 100644
index 0000000..250ad1b
--- /dev/null
+++ b/include/caffe/layers/segmentation_evaluate_layer.hpp
@@ -0,0 +1,77 @@
+/*
+* @Author: Eric612
+* @Date:   2019-03-11
+* @https://github.com/eric612/Caffe-YOLOv3-Windows
+* @https://github.com/eric612/MobileNet-YOLO
+* Avisonic , ELAN microelectronics
+*/
+#ifdef USE_OPENCV
+#include <opencv2/core/core.hpp>
+#include <opencv2/highgui/highgui.hpp>
+#include <opencv2/imgproc/imgproc.hpp>
+#endif  // USE_OPENCV
+#ifndef CAFFE_SEGMENTATION_EVALUATE_LAYER_HPP_
+#define CAFFE_SEGMENTATION_EVALUATE_LAYER_HPP_
+
+#include <utility>
+#include <vector>
+
+#include "caffe/blob.hpp"
+#include "caffe/layer.hpp"
+#include "caffe/proto/caffe.pb.h"
+
+namespace caffe {
+
+/**
+ * @brief Computes the per-class IOU between a segmentation prediction
+ * (bottom[0]) and its ground-truth label maps (bottom[1]).
+ *
+ * Registered as "SegmentationEvaluate"; meant for evaluation only.
+ *
+ * NOTE: does not implement Backwards operation.
+ */
+template <typename Dtype>
+class SegmentationEvaluateLayer : public Layer<Dtype> {
+ public:
+  explicit SegmentationEvaluateLayer(const LayerParameter& param)
+      : Layer<Dtype>(param) {}
+  virtual void LayerSetUp(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top);
+  virtual void Reshape(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top);
+
+  virtual inline const char* type() const { return "SegmentationEvaluate"; }
+  virtual inline int ExactNumBottomBlobs() const { return 2; }
+  virtual inline int ExactNumTopBlobs() const { return 1; }
+
+ protected:
+  /**
+   * @brief Evaluate the segmentation output.
+   *
+   * @param bottom input Blob vector (exact 2)
+   *   -# @f$ (N \times C \times H \times W) @f$
+   *      predicted per-class maps, compared against the threshold.
+   *   -# @f$ (N \times C \times H' \times W') @f$
+   *      ground-truth per-class maps.
+   * @param top Blob vector (length 1)
+   *   -# @f$ (1 \times 1 \times 1 \times C) @f$
+   *      IOU of each class, computed on the first item of the batch;
+   *      the prediction is resized when H, W differ from H', W'.
+   */
+  virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top);
+  /// @brief Not implemented
+  virtual void Backward_cpu(const vector<Blob<Dtype>*>& top,
+      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
+    NOT_IMPLEMENTED;
+  }
+  /// @brief Debug helper that shows bottom[0] in an OpenCV window.
+  void visualization(const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top);
+  int num_classes_;  // from segmentation_evaluate_param.num_classes
+  float threshold_;  // values above this count as foreground
+  int iter_;         // number of Forward_cpu calls so far
+};
+
+}  // namespace caffe
+
+#endif  // CAFFE_SEGMENTATION_EVALUATE_LAYER_HPP_
diff --git a/include/caffe/solver.hpp b/include/caffe/solver.hpp
index 253ab1b..9aa9aa3 100644
--- a/include/caffe/solver.hpp
+++ b/include/caffe/solver.hpp
@@ -105,6 +105,7 @@ class Solver {
   void TestAll();
   void TestClassification(const int test_net_id = 0);
   void TestDetection(const int test_net_id = 0);
+  void TestDetectionSeg(const int test_net_id = 0);
   virtual void SnapshotSolverState(const string& model_filename) = 0;
   virtual void RestoreSolverStateFromHDF5(const string& state_file) = 0;
   virtual void RestoreSolverStateFromBinaryProto(const string& state_file) = 0;
diff --git a/models/cityscapes/mobilenet_yolov3_solver.prototxt b/models/cityscapes/mobilenet_yolov3_solver.prototxt
index bb67a88..c15892f 100644
--- a/models/cityscapes/mobilenet_yolov3_solver.prototxt
+++ b/models/cityscapes/mobilenet_yolov3_solver.prototxt
@@ -1,7 +1,7 @@
 train_net: "models/cityscapes/mobilenet_yolov3_train.prototxt"
-#test_net: "models/cityscapes/mobilenet_yolov3_test.prototxt"
-#test_iter: 2000
-#test_interval: 1000
+test_net: "models/cityscapes/mobilenet_yolov3_test.prototxt"
+test_iter: 500
+test_interval: 1000
 base_lr: 0.0005
 display: 10
 max_iter: 120000
@@ -20,6 +20,6 @@ stepvalue: 40000
 stepvalue: 60000
 iter_size: 16
 type: "RMSProp"
-eval_type: "detection"
+eval_type: "detection_segmentation"
 ap_version: "11point"
 show_per_class_result: true
\ No newline at end of file
diff --git a/models/cityscapes/mobilenet_yolov3_test.prototxt b/models/cityscapes/mobilenet_yolov3_test.prototxt
index 0ae3c33..6533583 100644
--- a/models/cityscapes/mobilenet_yolov3_test.prototxt
+++ b/models/cityscapes/mobilenet_yolov3_test.prototxt
@@ -4,6 +4,7 @@ layer {
   type: "AnnotatedData"
   top: "data"
   top: "label"
+  top: "seg_label"
   include {
     phase: TEST
   }
@@ -22,14 +23,17 @@ layer {
     }
   }
   data_param {
-    source: "examples/bus/bus_test_lmdb"
+    source: "examples/cityscapes/cityscapes_val_lmdb"
     batch_size: 1
     backend: LMDB
   }
   annotated_data_param {
+    single_class : false
+    #seg_resize_width : 2048
+    #seg_resize_height : 1024
     batch_sampler {
     }
-    label_map_file: "data/VOC0712/labelmap_voc.prototxt"
+    label_map_file:  "data/cityscapes/labelmap_seg.prototxt"
   }
 }
 
@@ -2911,24 +2915,23 @@ layer {
 	#10,13,  16,30,  33,23,  30,61,  62,45,  59,119,  116,90,  156,198,  373,326
 	
     biases: 4
-    biases: 9
+    biases: 7
     biases: 6
-    biases: 15
+    biases: 13
     biases: 8
-    biases: 22
-    biases: 12
-    biases: 31
+    biases: 21
+    biases: 18
     biases: 16
-    biases: 42
-    biases: 22
-    biases: 57
-    biases: 32
-    biases: 81
-    biases: 50
-    biases: 124
-    biases: 82
-    biases: 207 
-	
+    biases: 12
+    biases: 33
+    biases: 20
+    biases: 47
+    biases: 30
+    biases: 71
+    biases: 46
+    biases: 112
+    biases: 78
+    biases: 193    
     mask:6
     mask:7
     mask:8
@@ -2944,7 +2947,6 @@ layer {
     mask_group_num:3
   }
 }
-
 layer {
   name: "detection_eval"
   type: "DetectionEvaluate"
@@ -2960,3 +2962,179 @@ layer {
   }
 }
 
+layer {
+  name: "conv25/dw"
+  type: "DepthwiseConvolution"
+  bottom: "conv20/sum"
+  top: "conv25/dw"
+  param {
+    lr_mult: 1
+    decay_mult: 1
+  }
+  convolution_param {
+    num_output: 256
+    bias_term: false
+    pad: 1
+    kernel_size: 3
+    group: 256
+    engine: CAFFE
+    weight_filler {
+      type: "msra"
+    }
+  }
+}
+layer {
+  name: "conv25/dw/bn"
+  type: "BatchNorm"
+  bottom: "conv25/dw"
+  top: "conv25/dw"
+  param {
+    lr_mult: 0
+    decay_mult: 0
+  }
+  param {
+    lr_mult: 0
+    decay_mult: 0
+  }
+  param {
+    lr_mult: 0
+    decay_mult: 0
+  }
+}
+layer {
+  name: "conv25/dw/scale"
+  type: "Scale"
+  bottom: "conv25/dw"
+  top: "conv25/dw"
+  param {
+    lr_mult: 1
+    decay_mult: 0.0
+  }
+  param {
+    lr_mult: 2
+    decay_mult: 0.0
+  }
+  scale_param {
+    filler {
+      value: 1
+    }
+    bias_term: true
+    bias_filler {
+      value: 0
+    }
+  }
+}
+layer {
+  name: "conv25/dw/relu"
+  type: "ReLU"
+  bottom: "conv25/dw"
+  top: "conv25/dw"
+}
+layer {
+  name: "conv25"
+  type: "Convolution"
+  bottom: "conv25/dw"
+  top: "conv25"
+  param {
+    lr_mult: 1
+    decay_mult: 1
+  }
+  convolution_param {
+    num_output: 256 
+    bias_term: false
+    kernel_size: 1
+    weight_filler {
+      type: "msra"
+    }
+  }
+}
+layer {
+  name: "conv25/bn"
+  type: "BatchNorm"
+  bottom: "conv25"
+  top: "conv25"
+  param {
+    lr_mult: 0
+    decay_mult: 0
+  }
+  param {
+    lr_mult: 0
+    decay_mult: 0
+  }
+  param {
+    lr_mult: 0
+    decay_mult: 0
+  }
+}
+layer {
+  name: "conv25/scale"
+  type: "Scale"
+  bottom: "conv25"
+  top: "conv25"
+  param {
+    lr_mult: 1.0
+    decay_mult: 0.0
+  }
+  param {
+    lr_mult: 2.0
+    decay_mult: 0.0
+  }
+  scale_param {
+    filler {
+      value: 1
+    }
+    bias_term: true
+    bias_filler {
+      value: 0
+    }
+  }
+}
+layer {
+  name: "conv25/relu"
+  type: "ReLU"
+  bottom: "conv25"
+  top: "conv25"
+}
+layer {
+  name: "conv26"
+  type: "Convolution"
+  bottom: "conv25"
+  top: "conv26"
+    param {
+    lr_mult: 1
+    decay_mult: 1
+  }
+  param {
+    lr_mult: 2
+    decay_mult: 0
+  }
+  convolution_param {
+    num_output: 3 # channel = class 
+    kernel_size: 1
+    pad: 0
+    stride: 1
+    weight_filler {
+      type: "msra"
+    }
+	bias_filler {
+      value: 0
+    }
+  }
+}
+layer {
+  name: "sigmoid"
+  bottom: "conv26"
+  top: "sigmoid"
+  type: "Sigmoid"
+}
+layer {
+  name: "segmentation_eval"
+  type: "SegmentationEvaluate"
+  bottom: "sigmoid"
+  bottom: "seg_label"
+  top: "segmentation_eval"
+
+  segmentation_evaluate_param {
+    num_classes: 3
+  }
+}
\ No newline at end of file
diff --git a/src/caffe/layers/annotated_data_layer.cpp b/src/caffe/layers/annotated_data_layer.cpp
index f835ed6..a019ef6 100644
--- a/src/caffe/layers/annotated_data_layer.cpp
+++ b/src/caffe/layers/annotated_data_layer.cpp
@@ -44,6 +44,9 @@ void AnnotatedDataLayer<Dtype>::DataLayerSetUp(
   yolo_data_jitter_ = anno_data_param.yolo_data_jitter();
   train_diffcult_ = anno_data_param.train_diffcult();
   single_class_ = anno_data_param.single_class();
+  seg_scales_ = anno_data_param.seg_scales();
+  seg_resize_width_ = anno_data_param.seg_resize_width();
+  seg_resize_height_ = anno_data_param.seg_resize_height();
   // Make sure dimension is consistent within batch.
   const TransformationParameter& transform_param =
     this->layer_param_.transform_param();
@@ -150,8 +153,15 @@ void AnnotatedDataLayer<Dtype>::DataLayerSetUp(
 	  vector<int> seg_label_shape(4, 1);
 	  seg_label_shape[0] = batch_size;
 	  seg_label_shape[1] = maxima;
-	  seg_label_shape[2] = top_shape[2] / 8;
-	  seg_label_shape[3] = top_shape[3] / 8;
+    if(seg_resize_width_==0 || seg_resize_height_==0) {
+      seg_label_shape[2] = top_shape[2] / seg_scales_;
+      seg_label_shape[3] = top_shape[3] / seg_scales_;
+    }      
+    else {
+      seg_label_shape[2] = seg_resize_height_;  // same order as load_batch:
+      seg_label_shape[3] = seg_resize_width_;   // axis 2 = height, 3 = width
+    }
+    LOG(INFO)<<seg_label_shape[0]<<","<<seg_label_shape[1]<<","<<seg_label_shape[2]<<","<<seg_label_shape[3];
 	  top[2]->Reshape(seg_label_shape);
 	  for (int i = 0; i < this->prefetch_.size(); ++i) {
 		  this->prefetch_[i]->seg_label_.Reshape(seg_label_shape);
@@ -362,15 +372,24 @@ void AnnotatedDataLayer<Dtype>::load_batch(Batch<Dtype>* batch) {
                                          &(this->transformed_data_));
     }
     if (this->output_seg_labels_) {
+      cv::Mat cv_lab = DecodeDatumToCVMatSeg(anno_datum.datum(), false);
       //LOG(INFO)<<iters_*batch_size + item_id;
       vector<int> seg_label_shape(4);
       seg_label_shape[0] = batch_size;
       seg_label_shape[1] = seg_label_maxima_;
-      seg_label_shape[2] = top_shape[2] / 8;
-      seg_label_shape[3] = top_shape[3] / 8;
+      
+      if(seg_resize_width_==0 || seg_resize_height_==0) {
+        seg_label_shape[2] = top_shape[2] / seg_scales_;
+        seg_label_shape[3] = top_shape[3] / seg_scales_;
+      }
+      else {
+        seg_label_shape[2] = seg_resize_height_;
+        seg_label_shape[3] = seg_resize_width_;        
+      }
+      //LOG(INFO)<<seg_resize_width_<<","<<seg_resize_height_;
       batch->seg_label_.Reshape(seg_label_shape);
       //caffe_set<Dtype>(8, 0, batch->seg_label_.mutable_cpu_data());
-      cv::Mat cv_lab = DecodeDatumToCVMatSeg(anno_datum.datum(), false);
+      
       
       cv::Mat crop_img;
       //LOG(INFO) << crop_box.xmin() << crop_box.xmax() << crop_box.ymin() << crop_box.ymax();
@@ -432,9 +451,9 @@ void AnnotatedDataLayer<Dtype>::load_batch(Batch<Dtype>* batch) {
           
           cv::LUT(crop_img,table,binary_img);
           //LOG(INFO)<<type2str(binary_img.type());
-          cv::resize(binary_img, resized, cv::Size(seg_label_shape[2], seg_label_shape[3]),cv::INTER_AREA);
+          cv::resize(binary_img, resized, cv::Size(seg_label_shape[3], seg_label_shape[2]),cv::INTER_AREA);
           //cv::threshold(resized, resized, 100, 255, cv::THRESH_BINARY);
-
+          //LOG(INFO)<<binary_img.cols<<","<<binary_img.rows;
           channels.push_back(resized);
           
           /*if(true) {
@@ -464,6 +483,9 @@ void AnnotatedDataLayer<Dtype>::load_batch(Batch<Dtype>* batch) {
         //LOG(INFO)<<type2str(merged.type());
         //LOG(INFO)<<merged.channels()<<","<<merged.rows<<","<<merged.cols;
         this->transformed_label_.Reshape(seg_label_shape);
+        //LOG(INFO)<<batch->seg_label_.width()<<","<<batch->seg_label_.height();
+        //LOG(INFO)<<this->transformed_label_.width()<<","<<this->transformed_label_.height();
+        //LOG(INFO)<<cv_lab.cols<<","<<cv_lab.rows;
         //LOG(INFO)<<seg_label_shape[0]<<","<<seg_label_shape[1]<<","<<seg_label_shape[2]<<","<<seg_label_shape[3];
         //LOG(INFO)<<this->transformed_label_.num()<<","<<this->transformed_label_.channels()<<","<<this->transformed_label_.width()<<","<<this->transformed_label_.height();      
         int offset = batch->seg_label_.offset(item_id);
diff --git a/src/caffe/layers/segmentation_evaluate_layer.cpp b/src/caffe/layers/segmentation_evaluate_layer.cpp
new file mode 100644
index 0000000..260c27e
--- /dev/null
+++ b/src/caffe/layers/segmentation_evaluate_layer.cpp
@@ -0,0 +1,180 @@
+/*
+* @Author: Eric612
+* @Date:   2019-03-11
+* @https://github.com/eric612/Caffe-YOLOv3-Windows
+* @https://github.com/eric612/MobileNet-YOLO
+* Avisonic , ELAN microelectronics
+*/
+
+#include <algorithm>
+#include <map>
+#include <string>
+#include <vector>
+
+#include "caffe/layers/segmentation_evaluate_layer.hpp"
+#include "caffe/util/bbox_util.hpp"
+#define BOUND(a,min_val,max_val)           ( (a < min_val) ? min_val : (a >= max_val) ? (max_val) : a )
+namespace caffe {
+
+// Loads num_classes (required) and threshold; resets the forward counter.
+template <typename Dtype>
+void SegmentationEvaluateLayer<Dtype>::LayerSetUp(
+      const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top) {
+  const SegmentationEvaluateParameter& eval_param =
+      this->layer_param_.segmentation_evaluate_param();
+  CHECK(eval_param.has_num_classes()) << "Must provide num_classes.";
+  iter_ = 0;
+  threshold_ = eval_param.threshold();
+  num_classes_ = eval_param.num_classes();
+}
+
+// Shapes top[0] as 1 x 1 x 1 x C: one IOU slot per segmentation channel.
+template <typename Dtype>
+void SegmentationEvaluateLayer<Dtype>::Reshape(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top) {
+  // Forward_cpu writes one value per bottom[0] channel and indexes bottom[1]
+  // with the same channel index, so both inputs must agree on channel count.
+  CHECK_EQ(bottom[0]->channels(), bottom[1]->channels())
+      << "Prediction and label must have the same number of channels.";
+  vector<int> top_shape(4, 1);
+  top_shape[3] = bottom[1]->channels();
+  top[0]->Reshape(top_shape);
+}
+// Debug helper (its only call, in Forward_cpu, is commented out): displays
+// the first height x width values of bottom[0], scaled by 255, as an 8-bit
+// grayscale image in the OpenCV window "show".
+template <typename Dtype>
+void SegmentationEvaluateLayer<Dtype>::visualization(const vector<Blob<Dtype>*>& bottom,const vector<Blob<Dtype>*>& top)
+{
+  const int cols = bottom[0]->width();
+  const int rows = bottom[0]->height();
+  cv::Mat canvas(rows, cols, CV_8UC1);
+  const Dtype* bottom_data = bottom[0]->cpu_data();
+  // Fetched but unused; read from it in the loop to view the labels instead.
+  const Dtype* label_data = bottom[1]->cpu_data();
+  // Values are consumed consecutively, one image row after another.
+  int src_index = 0;
+  for (int r = 0; r < rows; ++r) {
+    uchar* row_ptr = canvas.ptr<uchar>(r);
+    for (int c = 0; c < cols; ++c, ++src_index) {
+      row_ptr[c] = (unsigned char)((bottom_data[src_index]) * 255);
+    }
+  }
+
+  //cv::imwrite("test.jpg",canvas);
+  const char* const window = "show";
+  cv::namedWindow(window, cv::WINDOW_NORMAL);
+  cv::resizeWindow(window, 800, 400);
+  cv::imshow(window, canvas);
+  // A 1 ms wait lets HighGUI process events so the window actually repaints.
+  cv::waitKey(1);
+}
+// Per-channel IOU between the thresholded prediction (bottom[0]) and the
+// thresholded ground truth (bottom[1]); channel c's IOU is written to
+// top[0] at index c. Offsets start at 0 with no per-item stride, so only the
+// first item of the batch is evaluated.
+// Needs OpenCV (cv::Mat / cv::resize) for the mixed-resolution path.
+template <typename Dtype>
+void SegmentationEvaluateLayer<Dtype>::Forward_cpu(
+    const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top) {
+  const Dtype* seg_data = bottom[0]->cpu_data();
+  const Dtype* gt_data = bottom[1]->cpu_data();
+  Dtype* top_data = top[0]->mutable_cpu_data();
+  // One IOU value is produced per prediction channel.
+  const int size = bottom[0]->channels();
+
+  const int width = bottom[0]->width();
+  const int height = bottom[0]->height();
+  const int eval_width = bottom[1]->width();
+  const int eval_height = bottom[1]->height();
+  float iou = 0;
+  //visualization(bottom,top);
+  if(width == eval_width && height == eval_height) {
+    // Same resolution: compare the raw values against threshold_ directly.
+    const int len = width*height;
+    for(int c = 0; c<size;c++) {
+      int gt_pixel_num = 0;
+      int match_pixel_num = 0;
+      int eval_pixel_num = 0;
+      for (int i = 0; i < height ; i++) {
+        for (int j = 0; j < width ; j++) {
+          int index = c*len + i*width + j;
+          if(gt_data[index]>threshold_) {
+            gt_pixel_num++;
+            if(seg_data[index]>threshold_) {
+              match_pixel_num++;
+            }
+          }
+          if(seg_data[index]>threshold_) {
+            eval_pixel_num++;
+          }
+        }
+      }
+      // IOU = intersection / (gt + prediction - intersection); an empty
+      // intersection yields 0, which also avoids dividing by zero.
+      if(match_pixel_num)
+        iou = (float) match_pixel_num / (float)(gt_pixel_num + eval_pixel_num - match_pixel_num);
+      else
+        iou = 0;
+      top_data[c] = iou;
+    }
+  }
+  else {
+    // Different resolution: quantize each prediction plane to 8 bits, resize
+    // it to the ground-truth size, then compare against threshold_ * 255.
+    cv::Mat tmp_img(height, width, CV_8UC1);
+    cv::Mat eval_img(eval_height, eval_width, CV_8UC1);
+    const int len1 = width*height;
+    const int len2 = eval_width*eval_height;
+    // threshold_ expressed on the 0..255 scale of the resized image.
+    const int th = threshold_*255;
+    for(int c = 0; c<size;c++) {
+      // Running offset inside channel c's prediction plane.
+      int img_index1=0;
+      for (int i = 0; i < height; i++) {
+        uchar* ptr2 = tmp_img.ptr<uchar>(i);
+        for (int j = 0; j < width; j++) {
+          // Clamp before narrowing: casting first would wrap out-of-range
+          // values and leave the clamp with nothing to do.
+          const Dtype scaled = seg_data[img_index1+c*len1] * 255;
+          ptr2[j] = (unsigned char)BOUND(scaled,0,255);
+          img_index1++;
+        }
+      }
+      // dsize is given explicitly, so fx/fy are 0; the interpolation flag is
+      // the sixth argument of cv::resize, not the fourth.
+      cv::resize(tmp_img, eval_img, cv::Size(eval_width, eval_height), 0, 0, cv::INTER_AREA);
+      int gt_pixel_num = 0;
+      int match_pixel_num = 0;
+      int eval_pixel_num = 0;
+      for (int i = 0; i < eval_height; i++) {
+        const unsigned char* ptr = eval_img.ptr<unsigned char>(i);
+        for (int j = 0; j < eval_width; j++) {
+          int index = c*len2 + i*eval_width + j;
+          if(gt_data[index]>threshold_) {
+            gt_pixel_num++;
+            if(ptr[j]>th) {
+              match_pixel_num++;
+            }
+          }
+          if(ptr[j]>th) {
+            eval_pixel_num++;
+          }
+        }
+      }
+      // Same IOU rule as the equal-size branch (0 when nothing overlaps).
+      if(match_pixel_num)
+        iou = (float) match_pixel_num / (float)(gt_pixel_num + eval_pixel_num - match_pixel_num);
+      else
+        iou = 0;
+      top_data[c] = iou;
+    }
+  }
+  // Counts completed forward passes.
+  iter_++;
+}
+
+INSTANTIATE_CLASS(SegmentationEvaluateLayer);
+REGISTER_LAYER_CLASS(SegmentationEvaluate);
+
+}  // namespace caffe
diff --git a/src/caffe/layers/yolo_seg.cpp b/src/caffe/layers/yolo_seg.cpp
index 99debb4..6266c2e 100644
--- a/src/caffe/layers/yolo_seg.cpp
+++ b/src/caffe/layers/yolo_seg.cpp
@@ -46,6 +46,7 @@ void YoloSegLayer<Dtype>::Reshape(
   CHECK_EQ(bottom[0]->count(), bottom[0]->count()) <<
       "YoloSeg layer inputs must have the same count.";
   diff_.ReshapeLike(*bottom[0]);
+  swap_.ReshapeLike(*bottom[0]);
 }
 template <typename Dtype>
 void YoloSegLayer<Dtype>::visualization(const vector<Blob<Dtype>*>& bottom,const vector<Blob<Dtype>*>& top)
@@ -56,13 +57,16 @@ void YoloSegLayer<Dtype>::visualization(const vector<Blob<Dtype>*>& bottom,const
   uchar* ptr2;
   int img_index1 = 0;
   const Dtype* bottom_data = bottom[0]->cpu_data();
+  const Dtype* label_data = bottom[1]->cpu_data(); 
   for (int y = 0; y < h; y++) {
     uchar* ptr2 = img2.ptr<uchar>(y);
     int img_index2 = 0;
     for (int j = 0; j < w; j++)
     {
-      ptr2[img_index2] = (unsigned char)(sigmoid(bottom_data[img_index1]) * 255);
-
+      //LOG(INFO)<<(int)(bottom_data[img_index1] * 255);
+      ptr2[img_index2] = (unsigned char)(sigmoid(bottom_data[img_index1+w*h]) * 255);
+      
+      //ptr2[img_index2] = (unsigned char)((label_data[img_index1+w*h]) * 255);
       img_index1++;
       img_index2++;
     }
@@ -76,14 +80,17 @@ void YoloSegLayer<Dtype>::visualization(const vector<Blob<Dtype>*>& bottom,const
 template <typename Dtype>
 void YoloSegLayer<Dtype>::Forward_cpu(const vector<Blob<Dtype>*>& bottom,
     const vector<Blob<Dtype>*>& top) {
+  //LOG(INFO)<<bottom[1]->channels()<<","<<bottom[1]->num()<<","<<bottom[1]->width()<<","<<bottom[1]->height();
   const Dtype* bottom_data = bottom[0]->cpu_data();
   Dtype* top_data = top[0]->mutable_cpu_data();
   const int count = bottom[0]->count();
   const Dtype* label_data = bottom[1]->cpu_data(); //[label,x,y,w,h]
   if (diff_.width() != bottom[0]->width()) {
     diff_.ReshapeLike(*bottom[0]);
+    swap_.ReshapeLike(*bottom[0]);
   }
   Dtype* diff = diff_.mutable_cpu_data();
+  Dtype* swap = swap_.mutable_cpu_data();
   caffe_set(diff_.count(), Dtype(0.0), diff);
   Dtype loss(0.0);
   //LOG(INFO) << object_scale_;
diff --git a/src/caffe/proto/caffe.proto b/src/caffe/proto/caffe.proto
index c63e811..f20ed40 100644
--- a/src/caffe/proto/caffe.proto
+++ b/src/caffe/proto/caffe.proto
@@ -584,6 +584,7 @@ message LayerParameter {
   optional CropParameter crop_param = 144;
   optional DataParameter data_param = 107;
   optional DetectionEvaluateParameter detection_evaluate_param = 205;
+  optional SegmentationEvaluateParameter segmentation_evaluate_param = 605;
   optional YoloDetectionOutputParameter yolo_detection_output_param = 601;
   optional Yolov3DetectionOutputParameter yolov3_detection_output_param = 602;
   optional DropoutParameter dropout_param = 108;
@@ -852,6 +853,10 @@ message AnnotatedDataParameter {
   optional int32 yolo_data_type = 4 [default = 0];
   optional float yolo_data_jitter = 5 [default = 0.3];
   optional bool train_diffcult = 6 [default = false];
+  optional bool single_class =7 [default = true]; // for yolo segementation
+  optional int32 seg_scales = 8 [default = 8]; // for yolo segementation
+  optional int32 seg_resize_width = 9 [default = 0]; // for yolo segementation
+  optional int32 seg_resize_height = 10 [default = 0]; // for yolo segementation
 }
 
 message ArgMaxParameter {
diff --git a/src/caffe/solver.cpp b/src/caffe/solver.cpp
index 98194fa..32e5e13 100644
--- a/src/caffe/solver.cpp
+++ b/src/caffe/solver.cpp
@@ -354,6 +354,8 @@ void Solver<Dtype>::TestAll() {
       TestClassification(test_net_id);
     } else if (param_.eval_type() == "detection") {
       TestDetection(test_net_id);
+    } else if (param_.eval_type() == "detection_segmentation") {
+      TestDetectionSeg(test_net_id);
     } else {
       LOG(FATAL) << "Unknown evaluation type: " << param_.eval_type();
     }
@@ -434,7 +436,6 @@ void Solver<Dtype>::TestClassification(const int test_net_id) {
               << mean_score << loss_msg_stream.str();
   }
 }
-
 template <typename Dtype>
 void Solver<Dtype>::TestDetection(const int test_net_id) {
   CHECK(Caffe::root_solver());
@@ -555,6 +556,154 @@ void Solver<Dtype>::TestDetection(const int test_net_id) {
               << mAP;
   }
 }
+// Runs the test net and reports detection mAP (from result[0]) together with
+// per-class and mean segmentation IOU (from result[1]).
+template <typename Dtype>
+void Solver<Dtype>::TestDetectionSeg(const int test_net_id) {
+  CHECK(Caffe::root_solver());
+  LOG(INFO) << "Iteration " << iter_
+            << ", Testing net (#" << test_net_id << ")";
+  CHECK_NOTNULL(test_nets_[test_net_id].get())->
+      ShareTrainedLayersWith(net_.get());
+  map<int, map<int, vector<pair<float, int> > > > all_true_pos;
+  map<int, map<int, vector<pair<float, int> > > > all_false_pos;
+  map<int, map<int, int> > all_num_pos;
+  const shared_ptr<Net<Dtype> >& test_net = test_nets_[test_net_id];
+  Dtype loss = 0;
+  vector<float> iou;  // per-class IOU sums, sized on the first batch
+  int count = 0;
+  int classes = 1;
+  for (int i = 0; i < param_.test_iter(test_net_id); ++i) {
+    SolverAction::Enum request = GetRequestedAction();
+    // Check to see if stoppage of testing/training has been requested.
+    while (request != SolverAction::NONE) {
+        if (SolverAction::SNAPSHOT == request) {
+          Snapshot();
+        } else if (SolverAction::STOP == request) {
+          requested_early_exit_ = true;
+        }
+        request = GetRequestedAction();
+    }
+    if (requested_early_exit_) {
+      // break out of test loop.
+      break;
+    }
+
+    Dtype iter_loss;
+    const vector<Blob<Dtype>*>& result = test_net->Forward(&iter_loss);
+    CHECK_GE(result.size(), 2u) << "Expect detection and segmentation outputs.";
+    if (param_.test_compute_loss()) {
+      loss += iter_loss;
+    }
+    for (int j = 0; j < 2; ++j) {
+      if(j==0) {
+        CHECK_EQ(result[j]->width(), 5);
+        const Dtype* result_vec = result[j]->cpu_data();
+        int num_det = result[j]->height();
+        for (int k = 0; k < num_det; ++k) {
+          int item_id = static_cast<int>(result_vec[k * 5]);
+          int label = static_cast<int>(result_vec[k * 5 + 1]);
+          if (item_id == -1) {
+            // Special row of storing number of positives for a label.
+            if (all_num_pos[j].find(label) == all_num_pos[j].end()) {
+              all_num_pos[j][label] = static_cast<int>(result_vec[k * 5 + 2]);
+            } else {
+              all_num_pos[j][label] += static_cast<int>(result_vec[k * 5 + 2]);
+            }
+          } else {
+            // Normal row storing detection status.
+            float score = result_vec[k * 5 + 2];
+            int tp = static_cast<int>(result_vec[k * 5 + 3]);
+            int fp = static_cast<int>(result_vec[k * 5 + 4]);
+            if (tp == 0 && fp == 0) {
+              // Ignore such case. It happens when a detection bbox is matched to
+              // a difficult gt bbox and we don't evaluate on difficult gt bbox.
+              continue;
+            }
+            all_true_pos[j][label].push_back(std::make_pair(score, tp));
+            all_false_pos[j][label].push_back(std::make_pair(score, fp));
+          }
+        }
+      }
+      else if(j==1) {
+        const Dtype* iou_data = result[1]->cpu_data();
+        classes = result[1]->width();
+        if (iou.empty()) {
+          // Lazily size the accumulator once the class count is known.
+          iou.assign(classes, 0.f);
+        }
+        for (int c = 0; c < classes && c < static_cast<int>(iou.size()); ++c) {
+          iou[c] += iou_data[c];
+        }
+        count++;
+      }
+    }
+  }
+  // With no evaluated batch there is nothing to average (avoids 0/0).
+  float mIOU = 0;
+  for (size_t c = 0; count > 0 && c < iou.size(); ++c) {
+    const float eval_iou = iou[c] / static_cast<float>(count);
+    LOG(INFO)<< "Seg Classes " << c << " IOU : " << eval_iou;
+    mIOU += eval_iou;
+  }
+  LOG_IF(INFO, count > 0 && !iou.empty())<< "Seg mIOU : " << mIOU/(float)iou.size();
+  if (requested_early_exit_) {
+    LOG(INFO)     << "Test interrupted.";
+    return;
+  }
+  if (param_.test_compute_loss()) {
+    loss /= param_.test_iter(test_net_id);
+    LOG(INFO) << "Test loss: " << loss;
+  }
+  for (int i = 0; i < all_true_pos.size(); ++i) {
+    if (all_true_pos.find(i) == all_true_pos.end()) {
+      LOG(FATAL) << "Missing output_blob true_pos: " << i;
+    }
+    const map<int, vector<pair<float, int> > >& true_pos =
+        all_true_pos.find(i)->second;
+    if (all_false_pos.find(i) == all_false_pos.end()) {
+      LOG(FATAL) << "Missing output_blob false_pos: " << i;
+    }
+    const map<int, vector<pair<float, int> > >& false_pos =
+        all_false_pos.find(i)->second;
+    if (all_num_pos.find(i) == all_num_pos.end()) {
+      LOG(FATAL) << "Missing output_blob num_pos: " << i;
+    }
+    const map<int, int>& num_pos = all_num_pos.find(i)->second;
+    map<int, float> APs;
+    float mAP = 0.;
+    // Sort true_pos and false_pos with descend scores.
+    for (map<int, int>::const_iterator it = num_pos.begin();
+         it != num_pos.end(); ++it) {
+      int label = it->first;
+      int label_num_pos = it->second;
+      if (true_pos.find(label) == true_pos.end()) {
+        LOG(WARNING) << "Missing true_pos for label: " << label;
+        continue;
+      }
+      const vector<pair<float, int> >& label_true_pos =
+          true_pos.find(label)->second;
+      if (false_pos.find(label) == false_pos.end()) {
+        LOG(WARNING) << "Missing false_pos for label: " << label;
+        continue;
+      }
+      const vector<pair<float, int> >& label_false_pos =
+          false_pos.find(label)->second;
+      vector<float> prec, rec;
+      ComputeAP(label_true_pos, label_num_pos, label_false_pos,
+                param_.ap_version(), &prec, &rec, &(APs[label]));
+      mAP += APs[label];
+      if (param_.show_per_class_result()) {
+        LOG(INFO) << "class" << label << ": " << APs[label];
+      }
+    }
+    mAP /= num_pos.size();
+    const int output_blob_index = test_net->output_blob_indices()[i];
+    const string& output_name = test_net->blob_names()[output_blob_index];
+    LOG(INFO) << "    Test net output #" << i << ": " << output_name << " = "
+              << mAP;
+  }
+}
 
 template <typename Dtype>
 void Solver<Dtype>::Snapshot() {