# Copyright (c) 2022-2024 InterDigital Communications, Inc
# All rights reserved.
# Redistribution and use in source and binary forms, with or without
# modification, are permitted (subject to the limitations in the disclaimer
# below) provided that the following conditions are met:
# * Redistributions of source code must retain the above copyright notice,
# this list of conditions and the following disclaimer.
# * Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
# * Neither the name of InterDigital Communications, Inc nor the names of its
# contributors may be used to endorse or promote products derived from this
# software without specific prior written permission.
# NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE GRANTED BY
# THIS LICENSE. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND
# CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT
# NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
# PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
# OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
# WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
# OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
# ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
"""From 51 dataset into Detectron2-compatible dataset
"""
from math import floor
# import cv2
# import detectron2
import torch
# from detectron2.data import MetadataCatalog
from detectron2.structures import BoxMode
# from fiftyone import ProgressBar
from fiftyone.core.dataset import Dataset
from fiftyone.core.labels import Detection, Detections
from PIL import Image
def findLabels(dataset: Dataset, detection_field: str = "detections") -> list:
    """Return the distinct detection labels found in ``detection_field`` of an image dataset."""
    return dataset.distinct("%s.detections.label" % detection_field)
def findVideoLabels(dataset: Dataset, detection_field: str = "detections") -> list:
"""
Video datasets look like this:
::
Name: sfu-hw-objects-v1
Media type: video
Num samples: 1
Persistent: True
Tags: []
Sample fields:
id: fiftyone.core.fields.ObjectIdField
filepath: fiftyone.core.fields.StringField
tags: fiftyone.core.fields.ListField(fiftyone.core.fields.StringField)
metadata: fiftyone.core.fields.EmbeddedDocumentField(fiftyone.core.metadata.VideoMetadata)
media_type: fiftyone.core.fields.StringField
class_tag: fiftyone.core.fields.StringField
name_tag: fiftyone.core.fields.StringField
Frame fields:
id: fiftyone.core.fields.ObjectIdField
frame_number: fiftyone.core.fields.FrameNumberField
detections: fiftyone.core.fields.EmbeddedDocumentField(fiftyone.core.labels.Detections)
Frame labels can be accessed like this:
::
dataset.distinct("frames.%s.detections.label" % detection_field)
"""
return dataset.distinct("frames.%s.detections.label" % detection_field)
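
# A minimal usage sketch for the two helpers above (the dataset name
# "my-dataset" is hypothetical):
#
#   import fiftyone as fo
#   dataset = fo.load_dataset("my-dataset")
#   labels = findLabels(dataset)             # image dataset
#   video_labels = findVideoLabels(dataset)  # video dataset
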
class FO2DetectronDataset(torch.utils.data.Dataset):
    """A class to construct a Detectron2 dataset from a FiftyOne dataset.
    Subclass of ``torch.utils.data.Dataset``.

    :param fo_dataset: FiftyOne dataset
    :param detection_field: name of the FiftyOne ``Sample`` field where the
        detections (ground truth) are stored. Default: "detections".
    :param model_catids: a list of category labels, as provided by the
        Detectron2 model's metadata. Used to map a FiftyOne category label to
        the index number used by Detectron2.

    NOTE: Usually we are more interested in going from Detectron2 results to
    FiftyOne format, so you might not use this torch Dataset class that much.

    refs:

    - https://voxel51.com/docs/fiftyone/user_guide/using_datasets.html
    - https://towardsdatascience.com/stop-wasting-time-with-pytorch-datasets-17cac2c22fa8
    - https://medium.com/voxel51/how-to-train-your-dragon-detector-a35ed4672ca7

    WARNING: at the moment, only detection (not segmentation) is supported.
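
    Example (a sketch; ``cfg`` is assumed to be the Detectron2 config used to
    build the model, so that ``model_catids`` matches the model's own label order):
    ::

        from detectron2.data import MetadataCatalog

        model_meta = MetadataCatalog.get(cfg.DATASETS.TRAIN[0])
        ds = FO2DetectronDataset(
            fo_dataset=fo_dataset,
            detection_field="detections",
            model_catids=model_meta.thing_classes,
        )
        d = ds[0]  # a Detectron2-format dict: "file_name", "annotations", ...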
"""
def __init__(
self,
fo_dataset: Dataset = None,
detection_field="detections",
# let's use "detections" or "ground-truths" for GT and "predictions" for
# detectron2-give predictions
model_catids=[], # noqa: B006
# TODO (sampsa) "Do not use mutable data structures for argument defaults"
):
assert fo_dataset is not None, "please provide fo_dataset (fiftyone dataset)"
        assert (
            len(model_catids) > 0
        ), "please provide the MODEL's ORIGINAL category label list. Get this from the detectron2 model's metadata."
self.fo_dataset = fo_dataset
self.detection_field = detection_field
self.model_catids = model_catids
self.img_paths = self.fo_dataset.values(
"filepath"
) # list of all filepaths in the dataset
# Get list of distinct labels that exist in the view
# self.classes = self.fo_dataset.distinct(
# "%s.detections.label" % detection_field
# )
def __getitem__(self, idx):
img_path = self.img_paths[idx]
        sample = self.fo_dataset[
            img_path
        ]  # datasets can only be indexed with ids, filepaths, etc. (not with plain integers)
# metadata = sample.metadata
img = Image.open(img_path)
"""Example detectron2 formatted sample:
::
{'file_name': '/home/sampsa/fiftyone/openimagev6_mpeg_vcm_small_COCO/data/001997021f01f208.jpg',
'height': 1024,
'width': 760,
'image_id': 1,
'annotations': [
{'iscrowd': 0,
'bbox': [219.874232, 250.02800128, 58.95175599999998, 74.84913663999998],
'category_id': 32,
'bbox_mode': <BoxMode.XYWH_ABS: 1>},
{'iscrowd': 0,
'bbox': [430.188652, 410.87401984, 82.85117200000002, 95.55209215999997],
'category_id': 32,
'bbox_mode': <BoxMode.XYWH_ABS: 1>}
            ]
        }
Example fiftyone sample:
::
<Sample: {
'id': '62e55b386fa654ded87ece8e',
'media_type': 'image',
'filepath': '/tmp/compressai-vision/data/83bf8172abed42d7.jpg',
'tags': BaseList([]),
'metadata': None,
'open_images_id': '83bf8172abed42d7',
'detections': # can have several members whose class is Detections. Typically named "ground_truth" or "prediction"
# evaluators then use both fields
<Detections: {
'detections': BaseList([
<Detection: {
'id': '62e55b386fa654ded87ecda5',
'attributes': BaseDict({}),
'tags': BaseList([]),
'label': 'bird',
'bounding_box': BaseList([
0.27138644,
0.086283185,
0.72861356,
0.913716815,
]),
'mask': None,
'confidence': None,
'index': None,
'IsOccluded': False,
'IsTruncated': False,
'IsGroupOf': False,
'IsDepiction': False,
'IsInside': False,
}>,
]),
}>,
}>
        OpenImageV6 bbox format: xMin, xMax, yMin, yMax (or starting with yMin,
        depending on which tool you use(!), see
        https://stackoverflow.com/questions/55832578/how-to-make-sense-of-open-images-datasets-bounding-box-annotations)

        fiftyone bbox format: all relative coordinates: [x0, y0, w, h] (origin at top left)

        Detectron2 bbox format: see https://detectron2.readthedocs.io/en/latest/modules/structures.html#detectron2.structures.BoxMode
        (OpenImageV6 bbox seems to be equal to BoxMode.XYWH_REL = 3)

        The Detectron2 visualizer tool says for BoxMode.XYWH_REL:
        ::

            AssertionError: Relative mode not yet supported!

        So I deduce that all modes other than BoxMode.XYWH_ABS are, in practice, useless.
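
        Worked example of the conversion done below (hypothetical numbers): a
        fiftyone bbox [0.25, 0.1, 0.5, 0.5] on a 640x480 image becomes the
        absolute XYWH bbox [floor(0.25*640), floor(0.1*480), floor(0.5*640),
        floor(0.5*480)] = [160, 48, 320, 240].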
"""
d = {
"file_name": sample.filepath,
"height": img.height,
"width": img.width,
"image_id": sample.id,
}
annotations = []
detections = None
if sample[self.detection_field] is not None:
detections = sample[self.detection_field].detections
if detections is not None:
for detection in detections:
x_, y_, w_, h_ = detection.bounding_box
                op = floor  # quantize relative coordinates to integer pixel values
bbox = [
op(x_ * img.width),
op(y_ * img.height),
op(w_ * img.width),
op(h_ * img.height),
]
try:
n = self.model_catids.index(detection.label)
except ValueError:
print(
"found a label name that is not in the 'model_catids' provided"
)
raise
annotations.append(
{
"iscrowd": 0,
"bbox": bbox,
# category_id = 0, # from label to catid
"category_id": n,
# "bbox_mode" : BoxMode.XYWH_REL # does not work
"bbox_mode": BoxMode.XYWH_ABS,
}
)
d["annotations"] = annotations
return d
def __len__(self):
return len(self.img_paths)
def detectron251(
res,
model_catids: list = [], # noqa: B006
# TODO (sampsa) "Do not use mutable data structures for argument defaults"
allowed_labels: list = None,
verbose=False,
) -> list:
"""Detectron2 formatted results, i.e. ``{'instances': Instances}`` into FiftyOne-formatted results
This works for detectors and instance segmentation, where a segmentation is always accompanied with a bounding box
:param res: Detectron2 predictor output (a dictionary ``{'instances': Instances}``)
:param model_catids: A category label list, as provided by Detectron2 model's metadata
Returns FiftyOne ``Detections`` instance that can be attached to a FiftyOne ``Sample`` instance.
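
    Example (a sketch; assumes a Detectron2 ``DefaultPredictor`` named
    ``predictor``, its metadata in ``model_meta``, and a FiftyOne ``sample``;
    the "predictions" field name is just a convention):
    ::

        import cv2

        img = cv2.imread(sample.filepath)  # BGR, as DefaultPredictor expects
        res = predictor(img)
        sample["predictions"] = detectron251(
            res, model_catids=model_meta.thing_classes
        )
        sample.save()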
"""
assert (
len(model_catids) > 0
), "please provide MODEL's ORIGINAL category label list. Get it from detectron2 model's metadata."
"""
    You would get that list with:
::
model_dataset=cfg.DATASETS.TRAIN[0]
model_meta=MetadataCatalog.get(model_dataset)
model_meta.thing_classes
"""
instances = res["instances"]
"""
For example:
::
Instances(num_instances=5, image_height=447, image_width=1024, fields=[pred_boxes: Boxes(tensor([[130.7466, 12.2867, 962.2325, 354.4287],
[941.1395, 268.9208, 978.8932, 300.9435],
[891.1142, 275.1706, 942.4617, 299.9345],
[112.7023, 152.5475, 307.6531, 241.2931],
[815.8085, 311.5709, 898.5360, 346.5404]])),
scores: tensor([0.9964, 0.9494, 0.9204, 0.7443, 0.6773]), pred_classes: tensor([4, 7, 7, 4, 7])])
"""
dets = []
# all models give scores & pred_classes
# for bbox, score, pred_class in zip(instances.pred_boxes, instances.scores, instances.pred_classes):
for i, score in enumerate(instances.scores):
bbox = None
mask = None
class_index = instances[i].pred_classes.detach().item()
# index to label
try:
label = model_catids[class_index]
except IndexError:
print(
"model gave pred_class",
class_index,
"but the model_catids provided length is only",
len(model_catids),
)
raise
if allowed_labels is None:
pass
elif label not in allowed_labels:
if verbose:
print("detectron251: skipping label", label)
continue
# print(bbox.to("cpu").tolist(), score.to("cpu").item())
height, width = instances.image_size
"""https://voxel51.com/docs/fiftyone/api/fiftyone.core.labels.html#fiftyone.core.labels.Detection
bbox format: relative (0->1) bbox coordinates: [<top-left-x>, <top-left-y>, <width>, <height>]
mask: an instance segmentation mask for the detection within its bounding box, which should be a 2D binary or 0/1 integer numpy array
"""
# bboxes
if hasattr(instances, "pred_boxes"):
boxObject = instances.pred_boxes[i] # indexing returns a Boxes object
for t in boxObject: # noqa: B007
# TODO (sampsa) "Loop control variable 't' not used within the loop body"
# so annoying.. only way to get the tensor is to iterate
pass
x, y, x2, y2 = t.detach().tolist() # detectron2: abs coordinates
bbox = [
x / width,
y / height,
(x2 - x) / width,
(y2 - y) / height,
] # fiftyone: rex, rely, relw, relh
# print(bbox)
# segmentation
if hasattr(instances, "pred_masks"):
mask = instances.pred_masks[i].cpu().detach().numpy()
x_ = floor(x)
y_ = floor(y)
x2_ = floor(x2)
y2_ = floor(y2)
small_mask = mask[y_:y2_, x_:x2_]
if bbox is not None:
# TODO: What if we need both metrics when using Mask R-CNN?
if mask is None:
dets.append(
Detection(
label=label, confidence=score.detach().item(), bounding_box=bbox
)
)
else: # we have also a mask
dets.append(
Detection(
label=label,
confidence=score.detach().item(),
bounding_box=bbox,
mask=small_mask,
)
)
detections = Detections(detections=dets)
return detections
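

# A sketch of wrapping FO2DetectronDataset in a torch DataLoader (names are
# hypothetical; the identity collate_fn keeps each batch as a plain list of
# Detectron2-format dicts instead of trying to stack them into tensors):
#
#   loader = torch.utils.data.DataLoader(
#       FO2DetectronDataset(
#           fo_dataset=fo_dataset, model_catids=model_meta.thing_classes
#       ),
#       batch_size=2,
#       collate_fn=lambda batch: batch,
#   )
#   for batch in loader:
#       ...  # each batch is a list of dicts with "file_name", "annotations", etc.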