{
  "title": "Imitrob dataset v.2.0",
  "description": "Imitrob is a real-world benchmark dataset with 6D annotations designed for imitation learning, featuring human operators holding tools and demonstrating various tasks. The dataset spans three hand-held tools (a glue gun, a grout float and a roller), four human subjects, left and right hand scenarios, two camera viewpoints and various number of tasks for each tool. The image sequences are completely annotated with 6D pose of the tool, coordinates of the end effector etc. The spatial information about the tools was collected from a HTC Vive controller attached on top of them.\n\nThe dataset consists of two main subsets: Test, showing the tools \"in action\" in industrial-like environments (along with ground truth annotations and evaluation metrics), and Train, showing the tools being manipulated in front of a green background (supplied with augmentation methods). Train contains 39 326 images in total and can be used for algorithm training, Test (61 660 images) is suitable for evaluation.\n\nThe main idea behind Imitrob is to simulate real-world industrial usecases and their usual demands on the 6D object pose estimation algorithms: high precision of the pose estimation under heavy object occlusions, absence of the object 3D model (which is difficult to provide), handling of noisy/cluttered environments etc. It should thus help the users estimate the plausibility of a selected pose estimation algorithm for their own project. ",
  "keywords": [
    "6D object pose estimation",
    "hand-held tools",
    "imitation learning",
    "manipulation task",
    "real-world dataset",
    "benchmarking"
  ],
  "created": "2021",
  "modified": "2022",
  "publisher": "CIIRC CTU in Prague",
  "contact": {
    "name": "Karla Stepanova",
    "e-mail": "karla.stepanova@cvut.cz"
  },
  "URI": "http://imitrob.ciirc.cvut.cz/imitrobdataset.php",
  "identifier": "",
  "access level": "public",
  "license": "CC BY-NC-SA 4.0 license",
  "bibliographic citation": "@Misc{imitrobdataset,author =   {{CIIRC CTU in Prague}},title =    {{I}mitrob dataset version 2.0},howpublished = {\\url{http://imitrob.ciirc.cvut.cz/imitrobdataset.php}},year = 2022}",
  "supplementary code URL": "https://github.com/imitrob/imitrob_dataset_code",
  "category": "machine learning",
  "acquisition hardware:": {
    "cameras": {
        "manufacturer": "Intel",
        "model": "RealSense D-435",
        "resolution": {
            "width": 848,
            "height": 480
        },
        "framerate": 60
    },
    "pose acquisition": {
        "manufacturer": "HTC",
        "model": "Vive VR",
        "data frequency (Hz)": 30
    }
  },
  "subsets": {
    "ImitrobTest": {
      "description": "The Test subset contains 56 video sequences depicting a human operator using one of the tools to perform a certain task",
      "number of instances": "61 660",
      "data": {
        "6DOF/": "contains .json files with 6D pose data of the recorded tool for each video frame",
        "BBox/": "contains the bounding box coordinates (again in .json)",
        "Depth/": "contains the depth images of the whole scene",
        "Image/": "contains RGB image of each frame",
        "parameters.json": "contains coordinates of the 8 vertices of the bounding box for the tool with respect to the HTC Vive Tracker and intrinsic camera matrices for Camera 1 (KC1 ) and 2 (KC2 )."
      }
    },
    "ImitrobTrain": {
      "description": "The Train subset consists of 48 video sequences altogether, capturing each one of the three tools being manipulated randomly against a green background. There is a HTC Vive tracker attached on top of the tool to capture the 6D pose of the object. The data provided for this subset are the same as in the Test subset, plus two additional Mask and Mask_thresholding folders.",
      "number of instances": "39 326",
      "data": {
        "6DOF/": "contains .json files with 6D pose data of the recorded tool for each video frame",
        "BBox/": "contains the bounding box coordinates (again in .json)",
        "Depth/": "contains the depth images of the whole scene",
        "Image/": "contains RGB image of each frame",
        "Mask/": "contains only the segmented tool with a hand and transparent background",
        "Mask_thresholding/": "provides binary mask segmentations of the tools",
        "parameters.json": "contains coordinates of the 8 vertices of the bounding box for the tool with respect to the HTC Vive Tracker  and intrinsic camera matrices for Camera 1 (KC1) and 2 (KC2)."
      }
    }
  }
}