personalrobotics · sriramk117 · Aug 7, 2024 · Aug 11, 2024 · Aug 13, 2024 · Aug 14, 2024
diff --git a/.gitignore b/.gitignore
@@ -2,6 +2,9 @@
 build/
 __pycache__/
 
+# Environment Variables file
+.env
+
 # Compiled Object files
 *.slo
 *.lo

diff --git a/ada_feeding_msgs/CMakeLists.txt b/ada_feeding_msgs/CMakeLists.txt
@@ -24,10 +24,12 @@ rosidl_generate_interfaces(${PROJECT_NAME}
 
   "action/AcquireFood.action"
   "action/ActivateController.action"
+  "action/GenerateCaption.action"
   "action/MoveTo.action"
   "action/MoveToConfiguration.action"
   "action/MoveToMouth.action"
   "action/SegmentAllItems.action"
+  "action/SegmentFromBox.action"
   "action/SegmentFromPoint.action"
   "action/Teleoperate.action"
   "action/Trigger.action"

diff --git a/ada_feeding_msgs/action/GenerateCaption.action b/ada_feeding_msgs/action/GenerateCaption.action
@@ -0,0 +1,28 @@
+# The interface for an action that takes in a list of input labels
+# describing the food items on a plate and returns a sentence caption compiling
+# these labels, to be used as a query for GroundingDINO detection.
+
+# A list of semantic labels corresponding to each of the masks of detected
+# items in the image
+string[] input_labels
+---
+# Possible return statuses
+uint8 STATUS_SUCCEEDED=0
+uint8 STATUS_FAILED=1
+uint8 STATUS_CANCELED=3
+uint8 STATUS_UNKNOWN=99
+
+# Whether the vision pipeline succeeded and if not, why
+uint8 status
+
+# The header for the image that the caption generated by GPT-4o
+# corresponds to
+std_msgs/Header header
+# The camera intrinsics
+sensor_msgs/CameraInfo camera_info
+# A sentence caption compiling the semantic labels used as a query for 
+# GroundingDINO to perform bounding box detections.
+string caption
+---
+# How much time the action has spent running inference on GPT-4o
+builtin_interfaces/Duration elapsed_time
diff --git a/ada_feeding_msgs/action/SegmentAllItems.action b/ada_feeding_msgs/action/SegmentAllItems.action
@@ -1,22 +1,27 @@
 # The interface for an action that gets an image from the camera and returns
-# the masks of all segmented items within that image.
+# the bounding boxes of all items within that image.
 
+# A sentence caption compiling the semantic labels of the food items on the plate
+string caption   
 ---
 # Possible return statuses
 uint8 STATUS_SUCCEEDED=0
 uint8 STATUS_FAILED=1
 uint8 STATUS_CANCELED=3
 uint8 STATUS_UNKNOWN=99
 
-# Whether the segmentation succeeded and if not, why
+# Whether the vision pipeline succeeded and if not, why
 uint8 status
 
 # The header for the image that the masks corresponds to
 std_msgs/Header header
 # The camera intrinsics
 sensor_msgs/CameraInfo camera_info
-# Masks of all the detected items in the image
-ada_feeding_msgs/Mask[] detected_items
+# Bounding boxes of all the detected items in the image
+sensor_msgs/RegionOfInterest[] detected_items
+# A list of semantic labels corresponding to each of the bounding boxes of
+# detected items in the image
+string[] item_labels
 ---
-# How much time the action has spent segmenting the food item
+# How much time the action has spent running the vision pipeline
 builtin_interfaces/Duration elapsed_time
diff --git a/ada_feeding_msgs/action/SegmentFromBox.action b/ada_feeding_msgs/action/SegmentFromBox.action
@@ -0,0 +1,28 @@
+# The interface for an action that gets an image from the camera and a bounding
+# box of the desired item to segment, and then returns the pixel-wise mask
+# of that item
+
+# The region of interest (bounding box) to seed the segmentation algorithm with
+sensor_msgs/RegionOfInterest region_of_interest
+
+# The semantic label describing the item bounded by the region of interest 
+string label
+---
+# Possible return statuses
+uint8 STATUS_SUCCEEDED=0
+uint8 STATUS_FAILED=1
+uint8 STATUS_CANCELED=3
+uint8 STATUS_UNKNOWN=99
+
+# Whether the segmentation succeeded and if not, why
+uint8 status
+
+# The header for the image that the mask corresponds to
+std_msgs/Header header
+# The camera intrinsics
+sensor_msgs/CameraInfo camera_info
+# Top contender mask segmented given a bounding box of an item
+ada_feeding_msgs/Mask detected_item
+---
+# How much time the action has spent segmenting the food item
+builtin_interfaces/Duration elapsed_time
diff --git a/ada_feeding_msgs/msg/Mask.msg b/ada_feeding_msgs/msg/Mask.msg
@@ -19,6 +19,9 @@ float64 average_depth
 # An arbitrary ID that defines the segmented item
 string item_id
 
-# A score that indicates how confident the segemntation algorithm is in
+# An ID that semantically labels a specific, segmented item
+string object_id
+
+# A score that indicates how confident the segmentation algorithm is in
 # this mask.
 float64 confidence
diff --git a/ada_feeding_perception/ada_feeding_perception/ada_feeding_perception_node.py b/ada_feeding_perception/ada_feeding_perception/ada_feeding_perception_node.py
@@ -178,6 +178,7 @@ def main(args=None):
     # We don't need to worry about the cyclic import because this is inside main().
     from ada_feeding_perception.face_detection import FaceDetectionNode
     from ada_feeding_perception.food_on_fork_detection import FoodOnForkDetectionNode
+    from ada_feeding_perception.segment_all_items import SegmentAllItemsNode
     from ada_feeding_perception.segment_from_point import SegmentFromPointNode
     from ada_feeding_perception.table_detection import TableDetectionNode
 
@@ -186,7 +187,8 @@ def main(args=None):
     node = ADAFeedingPerceptionNode("ada_feeding_perception")
     face_detection = FaceDetectionNode(node)
     food_on_fork_detection = FoodOnForkDetectionNode(node)
-    segment_from_point = SegmentFromPointNode(node)  # pylint: disable=unused-variable
+    segment_all_items = SegmentAllItemsNode(node)  # pylint: disable=unused-variable
+    # segment_from_point = SegmentFromPointNode(node)  # pylint: disable=unused-variable
     table_detection = TableDetectionNode(node)
     executor = MultiThreadedExecutor(num_threads=16)
 

diff --git a/ada_feeding_perception/ada_feeding_perception/helpers.py b/ada_feeding_perception/ada_feeding_perception/helpers.py
@@ -358,6 +358,7 @@ def get_img_msg_type(
 
     # Resolve the topic name (e.g., handle remappings)
     final_topic = node.resolve_topic_name(topic)
+    rclpy.logging.get_logger("ada_feeding_perception_helpers").info("Resolving topic name: " + final_topic)
 
     # Get the publishers on the topic
     topic_endpoints = node.get_publishers_info_by_topic(final_topic)
-Original file line number
+Diff line change
@@ Expand Up / @@ -2,6 +2,9 @@ @@
     build/
     __pycache__/
+    # Environment Variables file
+    .env
     # Compiled Object files
     *.slo
     *.lo
@@ Expand Down @@