Skip to content

Commit

Permalink
Add annoset relative tool
Browse files Browse the repository at this point in the history
  • Loading branch information
eric612 committed Sep 17, 2018
1 parent ab6746c commit 996f1bb
Show file tree
Hide file tree
Showing 8 changed files with 985 additions and 0 deletions.
38 changes: 38 additions & 0 deletions data/coco/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
### Preparation
1. Download Images and Annotations from [MSCOCO](http://mscoco.org/dataset/#download). By default, we assume the data is stored in `$HOME/data/coco`

2. Get the coco code. We will call the directory that you cloned coco into `$COCO_ROOT`
```Shell
git clone https://github.com/weiliu89/coco.git
cd coco
git checkout dev
```

3. Build the coco code.
```Shell
cd PythonAPI
python setup.py build_ext --inplace
```

4. Split the annotations into one file per image and get the image size info.
```Shell
# Check scripts/batch_split_annotation.py and change settings accordingly.
python scripts/batch_split_annotation.py
# Create the minival2014_name_size.txt and test-dev2015_name_size.txt in $CAFFE_ROOT/data/coco
python scripts/batch_get_image_size.py
```

5. Create the LMDB file.
```Shell
cd $CAFFE_ROOT
# Create the minival.txt, testdev.txt, test.txt, train.txt in data/coco/
python data/coco/create_list.py
# You can modify the parameters in create_data.sh if needed.
# It will create lmdb files for minival, testdev, test, and train with encoded original image:
# - $HOME/data/coco/lmdb/coco_minival_lmdb
# - $HOME/data/coco/lmdb/coco_testdev_lmdb
# - $HOME/data/coco/lmdb/coco_test_lmdb
# - $HOME/data/coco/lmdb/coco_train_lmdb
# and make soft links at examples/coco/
./data/coco/create_data.sh
```
29 changes: 29 additions & 0 deletions data/coco/create_data.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
# Build LMDB databases for the COCO detection subsets.
#
# For each subset (minival, testdev, train, test) this invokes
# scripts/create_annoset.py with the list file produced by
# data/coco/create_list.py, writing an LMDB to
#   $data_root_dir/lmdb/coco_<subset>_lmdb
# and a soft link under examples/coco/.  Per-subset output is tee'd
# to data/coco/<subset>.log.

# Resolve the repository root relative to this script's location.
cur_dir=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
root_dir="$cur_dir/../.."

# Abort if the repo root cannot be entered; running create_annoset.py
# from the wrong directory would silently produce bad paths.
cd "$root_dir" || exit 1

redo=false
data_root_dir="$HOME/data/coco"
dataset_name="coco"
mapfile="$root_dir/data/$dataset_name/labelmap_coco.prototxt"
anno_type="detection"
label_type="json"
db="lmdb"
# 0 means "do not constrain / do not resize".
min_dim=0
max_dim=0
width=0
height=0

extra_cmd="--encode-type=jpg --encoded"
if $redo
then
  extra_cmd="$extra_cmd --redo"
fi

for subset in minival testdev train test
do
  python "$root_dir/scripts/create_annoset.py" \
      --anno-type="$anno_type" --label-type="$label_type" \
      --label-map-file="$mapfile" --min-dim="$min_dim" --max-dim="$max_dim" \
      --resize-width="$width" --resize-height="$height" --check-label \
      $extra_cmd \
      "$data_root_dir" \
      "$root_dir/data/$dataset_name/$subset.txt" \
      "$data_root_dir/$db/${dataset_name}_${subset}_$db" \
      "examples/$dataset_name" 2>&1 | tee "$root_dir/data/$dataset_name/$subset.log"
done
121 changes: 121 additions & 0 deletions data/coco/create_list.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,121 @@
import argparse
import os
from random import shuffle
import shutil
import subprocess
import sys

HOMEDIR = os.path.expanduser("~")
CURDIR = os.path.dirname(os.path.realpath(__file__))

# If true, re-create all list files.
redo = True
# The root directory which holds all information of the dataset.
data_dir = "{}/data/coco".format(HOMEDIR)
# The directory name which holds the image sets.
imgset_dir = "ImageSets"
# The direcotry which contains the images.
img_dir = "images"
img_ext = "jpg"
# The directory which contains the annotations.
anno_dir = "Annotations"
anno_ext = "json"

train_list_file = "{}/train.txt".format(CURDIR)
minival_list_file = "{}/minival.txt".format(CURDIR)
testdev_list_file = "{}/testdev.txt".format(CURDIR)
test_list_file = "{}/test.txt".format(CURDIR)

# Create training set.
# We follow Ross Girschick's split.
if redo or not os.path.exists(train_list_file):
datasets = ["train2014", "valminusminival2014"]
img_files = []
anno_files = []
for dataset in datasets:
imgset_file = "{}/{}/{}.txt".format(data_dir, imgset_dir, dataset)
with open(imgset_file, "r") as f:
for line in f.readlines():
name = line.strip("\n")
subset = name.split("_")[1]
img_file = "{}/{}/{}.{}".format(img_dir, subset, name, img_ext)
assert os.path.exists("{}/{}".format(data_dir, img_file)), \
"{}/{} does not exist".format(data_dir, img_file)
anno_file = "{}/{}/{}.{}".format(anno_dir, subset, name, anno_ext)
assert os.path.exists("{}/{}".format(data_dir, anno_file)), \
"{}/{} does not exist".format(data_dir, anno_file)
img_files.append(img_file)
anno_files.append(anno_file)
# Shuffle the images.
idx = [i for i in xrange(len(img_files))]
shuffle(idx)
with open(train_list_file, "w") as f:
for i in idx:
f.write("{} {}\n".format(img_files[i], anno_files[i]))

if redo or not os.path.exists(minival_list_file):
datasets = ["minival2014"]
subset = "val2014"
img_files = []
anno_files = []
for dataset in datasets:
imgset_file = "{}/{}/{}.txt".format(data_dir, imgset_dir, dataset)
with open(imgset_file, "r") as f:
for line in f.readlines():
name = line.strip("\n")
img_file = "{}/{}/{}.{}".format(img_dir, subset, name, img_ext)
assert os.path.exists("{}/{}".format(data_dir, img_file)), \
"{}/{} does not exist".format(data_dir, img_file)
anno_file = "{}/{}/{}.{}".format(anno_dir, subset, name, anno_ext)
assert os.path.exists("{}/{}".format(data_dir, anno_file)), \
"{}/{} does not exist".format(data_dir, anno_file)
img_files.append(img_file)
anno_files.append(anno_file)
with open(minival_list_file, "w") as f:
for i in xrange(len(img_files)):
f.write("{} {}\n".format(img_files[i], anno_files[i]))

if redo or not os.path.exists(testdev_list_file):
datasets = ["test-dev2015"]
subset = "test2015"
img_files = []
anno_files = []
for dataset in datasets:
imgset_file = "{}/{}/{}.txt".format(data_dir, imgset_dir, dataset)
with open(imgset_file, "r") as f:
for line in f.readlines():
name = line.strip("\n")
img_file = "{}/{}/{}.{}".format(img_dir, subset, name, img_ext)
assert os.path.exists("{}/{}".format(data_dir, img_file)), \
"{}/{} does not exist".format(data_dir, img_file)
anno_file = "{}/{}/{}.{}".format(anno_dir, subset, name, anno_ext)
assert os.path.exists("{}/{}".format(data_dir, anno_file)), \
"{}/{} does not exist".format(data_dir, anno_file)
img_files.append(img_file)
anno_files.append(anno_file)
with open(testdev_list_file, "w") as f:
for i in xrange(len(img_files)):
f.write("{} {}\n".format(img_files[i], anno_files[i]))

if redo or not os.path.exists(test_list_file):
datasets = ["test2015"]
subset = "test2015"
img_files = []
anno_files = []
for dataset in datasets:
imgset_file = "{}/{}/{}.txt".format(data_dir, imgset_dir, dataset)
with open(imgset_file, "r") as f:
for line in f.readlines():
name = line.strip("\n")
img_file = "{}/{}/{}.{}".format(img_dir, subset, name, img_ext)
assert os.path.exists("{}/{}".format(data_dir, img_file)), \
"{}/{} does not exist".format(data_dir, img_file)
anno_file = "{}/{}/{}.{}".format(anno_dir, subset, name, anno_ext)
assert os.path.exists("{}/{}".format(data_dir, anno_file)), \
"{}/{} does not exist".format(data_dir, anno_file)
img_files.append(img_file)
anno_files.append(anno_file)
with open(test_list_file, "w") as f:
for i in xrange(len(img_files)):
f.write("{} {}\n".format(img_files[i], anno_files[i]))

Loading

0 comments on commit 996f1bb

Please sign in to comment.