Source code for compressai_vision.pipelines.fo_vcm.conversion.detectron2

# Copyright (c) 2022-2024 InterDigital Communications, Inc
# All rights reserved.

# Redistribution and use in source and binary forms, with or without
# modification, are permitted (subject to the limitations in the disclaimer
# below) provided that the following conditions are met:

# * Redistributions of source code must retain the above copyright notice,
#   this list of conditions and the following disclaimer.
# * Redistributions in binary form must reproduce the above copyright notice,
#   this list of conditions and the following disclaimer in the documentation
#   and/or other materials provided with the distribution.
# * Neither the name of InterDigital Communications, Inc nor the names of its
#   contributors may be used to endorse or promote products derived from this
#   software without specific prior written permission.

# NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE GRANTED BY
# THIS LICENSE. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND
# CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT
# NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
# PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
# OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
# WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
# OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
# ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

"""From 51 dataset into Detectron2-compatible dataset
"""
from math import floor

# import cv2
# import detectron2
import torch

# from detectron2.data import MetadataCatalog
from detectron2.structures import BoxMode

# from fiftyone import ProgressBar
from fiftyone.core.dataset import Dataset
from fiftyone.core.labels import Detection, Detections
from PIL import Image


def findLabels(dataset: Dataset, detection_field: str = "detections") -> list:
    """Return the distinct detection labels present in an image dataset."""
    return dataset.distinct("%s.detections.label" % detection_field)

def findVideoLabels(dataset: Dataset, detection_field: str = "detections") -> list:
    """Video datasets look like this:

    ::

        Name:        sfu-hw-objects-v1
        Media type:  video
        Num samples: 1
        Persistent:  True
        Tags:        []
        Sample fields:
            id:         fiftyone.core.fields.ObjectIdField
            filepath:   fiftyone.core.fields.StringField
            tags:       fiftyone.core.fields.ListField(fiftyone.core.fields.StringField)
            metadata:   fiftyone.core.fields.EmbeddedDocumentField(fiftyone.core.metadata.VideoMetadata)
            media_type: fiftyone.core.fields.StringField
            class_tag:  fiftyone.core.fields.StringField
            name_tag:   fiftyone.core.fields.StringField
        Frame fields:
            id:           fiftyone.core.fields.ObjectIdField
            frame_number: fiftyone.core.fields.FrameNumberField
            detections:   fiftyone.core.fields.EmbeddedDocumentField(fiftyone.core.labels.Detections)

    Frame labels can be accessed like this:

    ::

        dataset.distinct("frames.%s.detections.label" % detection_field)
    """
    return dataset.distinct("frames.%s.detections.label" % detection_field)

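
# --- Usage sketch (illustration only, not part of the original module) ---
# How the two helpers above might be called. The dataset names are hypothetical
# stand-ins for whatever FiftyOne datasets are registered on your machine.
def _example_find_labels():  # pragma: no cover
    import fiftyone as fo

    image_dataset = fo.load_dataset("oiv6-mpeg-detection-v1")  # hypothetical name
    print(findLabels(image_dataset, detection_field="detections"))

    video_dataset = fo.load_dataset("sfu-hw-objects-v1")  # name taken from the docstring above
    print(findVideoLabels(video_dataset, detection_field="detections"))
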
class FO2DetectronDataset(torch.utils.data.Dataset):
    """A class to construct a Detectron2 dataset from a FiftyOne dataset.
    Subclass of ``torch.utils.data.Dataset``.

    :param fo_dataset: FiftyOne dataset
    :param detection_field: name of the member in the FiftyOne Sample where the
        detections (ground truth) are put into. Default: "detections".
    :param model_catids: a list of category labels as provided by the Detectron2
        model's metadata. Used to transform a FiftyOne category label into the
        index number used by Detectron2.

    NOTE: Usually we are more interested in going from Detectron2 results to the
    FiftyOne format, so you might not use this torch Dataset class that much.

    refs:

    - https://voxel51.com/docs/fiftyone/user_guide/using_datasets.html
    - https://towardsdatascience.com/stop-wasting-time-with-pytorch-datasets-17cac2c22fa8
    - https://medium.com/voxel51/how-to-train-your-dragon-detector-a35ed4672ca7

    WARNING: at the moment, only detection (not segmentation) is supported
    """

    def __init__(
        self,
        fo_dataset: Dataset = None,
        detection_field="detections",
        # let's use "detections" or "ground-truths" for GT and "predictions" for
        # detectron2-generated predictions
        model_catids=[],  # noqa: B006 # TODO (sampsa) "Do not use mutable data structures for argument defaults"
    ):
        assert fo_dataset is not None, "please provide fo_dataset (fiftyone dataset)"
        assert (
            len(model_catids) > 0
        ), "please provide the MODEL's ORIGINAL category label list. Get it from the detectron2 model's metadata."
        self.fo_dataset = fo_dataset
        self.detection_field = detection_field
        self.model_catids = model_catids
        self.img_paths = self.fo_dataset.values(
            "filepath"
        )  # list of all filepaths in the dataset
        # Get list of distinct labels that exist in the view
        # self.classes = self.fo_dataset.distinct(
        #     "%s.detections.label" % detection_field
        # )

    def __getitem__(self, idx):
        img_path = self.img_paths[idx]
        sample = self.fo_dataset[
            img_path
        ]  # datasets may only be indexed with ids, filepaths, etc. (not with plain integers)
        # metadata = sample.metadata
        img = Image.open(img_path)
        """Example Detectron2-formatted sample:

        ::

            {'file_name': '/home/sampsa/fiftyone/openimagev6_mpeg_vcm_small_COCO/data/001997021f01f208.jpg',
             'height': 1024,
             'width': 760,
             'image_id': 1,
             'annotations': [
                {'iscrowd': 0,
                 'bbox': [219.874232, 250.02800128, 58.95175599999998, 74.84913663999998],
                 'category_id': 32,
                 'bbox_mode': <BoxMode.XYWH_ABS: 1>},
                {'iscrowd': 0,
                 'bbox': [430.188652, 410.87401984, 82.85117200000002, 95.55209215999997],
                 'category_id': 32,
                 'bbox_mode': <BoxMode.XYWH_ABS: 1>}
             ]}

        Example fiftyone sample:

        ::

            <Sample: {
                'id': '62e55b386fa654ded87ece8e',
                'media_type': 'image',
                'filepath': '/tmp/compressai-vision/data/83bf8172abed42d7.jpg',
                'tags': BaseList([]),
                'metadata': None,
                'open_images_id': '83bf8172abed42d7',
                'detections':
                # can have several members whose class is Detections.
                # Typically named "ground_truth" or "prediction";
                # evaluators then use both fields.
                <Detections: {
                    'detections': BaseList([
                        <Detection: {
                            'id': '62e55b386fa654ded87ecda5',
                            'attributes': BaseDict({}),
                            'tags': BaseList([]),
                            'label': 'bird',
                            'bounding_box': BaseList([
                                0.27138644,
                                0.086283185,
                                0.72861356,
                                0.913716815,
                            ]),
                            'mask': None,
                            'confidence': None,
                            'index': None,
                            'IsOccluded': False,
                            'IsTruncated': False,
                            'IsGroupOf': False,
                            'IsDepiction': False,
                            'IsInside': False,
                        }>,
                    ]),
                }>,
            }>

        OpenImageV6 bbox format: xMin, xMax, yMin, yMax (or starting with yMin,
        depending on what tool you use(!)
        https://stackoverflow.com/questions/55832578/how-to-make-sense-of-open-images-datasets-bounding-box-annotations)

        fiftyone bbox format: all relative coordinates: [x0, y0, w, h] (origin at top left)

        Detectron2 bbox format: see
        https://detectron2.readthedocs.io/en/latest/modules/structures.html#detectron2.structures.BoxMode
        (the OpenImageV6 bbox seems to be equal to BoxMode.XYWH_REL = 3)

        The Detectron2 visualizer tool says for BoxMode.XYWH_REL:

        ::

            AssertionError: Relative mode not yet supported!

        So I deduce that, in practice, all modes other than BoxMode.XYWH_ABS are useless.
        """
        d = {
            "file_name": sample.filepath,
            "height": img.height,
            "width": img.width,
            "image_id": sample.id,
        }
        annotations = []
        detections = None
        if sample[self.detection_field] is not None:
            detections = sample[self.detection_field].detections
        if detections is not None:
            for detection in detections:
                # fiftyone relative [x0, y0, w, h] -> Detectron2 absolute XYWH
                x_, y_, w_, h_ = detection.bounding_box
                op = floor
                bbox = [
                    op(x_ * img.width),
                    op(y_ * img.height),
                    op(w_ * img.width),
                    op(h_ * img.height),
                ]
                try:
                    n = self.model_catids.index(detection.label)
                except ValueError:
                    print(
                        "found a label name that is not in the 'model_catids' provided"
                    )
                    raise
                annotations.append(
                    {
                        "iscrowd": 0,
                        "bbox": bbox,
                        # from label to category id
                        "category_id": n,
                        # "bbox_mode": BoxMode.XYWH_REL  # does not work
                        "bbox_mode": BoxMode.XYWH_ABS,
                    }
                )
        d["annotations"] = annotations
        return d

    def __len__(self):
        return len(self.img_paths)

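
# --- Usage sketch (illustration only, not part of the original module) ---
# Constructing the torch dataset above: "model_catids" comes from the Detectron2
# model's metadata (thing_classes), as the class docstring suggests. The model
# config and the FiftyOne dataset name are assumptions.
def _example_fo2detectron_dataset():  # pragma: no cover
    import fiftyone as fo
    from detectron2 import model_zoo
    from detectron2.config import get_cfg
    from detectron2.data import MetadataCatalog

    cfg = get_cfg()
    cfg.merge_from_file(
        model_zoo.get_config_file("COCO-Detection/faster_rcnn_X_101_32x8d_FPN_3x.yaml")
    )
    model_meta = MetadataCatalog.get(cfg.DATASETS.TRAIN[0])

    fo_dataset = fo.load_dataset("oiv6-mpeg-detection-v1")  # hypothetical name
    ds = FO2DetectronDataset(
        fo_dataset=fo_dataset,
        detection_field="detections",
        model_catids=model_meta.thing_classes,
    )
    d = ds[0]  # Detectron2-style dict: file_name, height, width, image_id, annotations
    print(d["file_name"], len(d["annotations"]))
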
def detectron251(
    res,
    model_catids: list = [],  # noqa: B006 # TODO (sampsa) "Do not use mutable data structures for argument defaults"
    allowed_labels: list = None,
    verbose=False,
) -> Detections:
    """Convert Detectron2-formatted results, i.e. ``{'instances': Instances}``, into
    FiftyOne-formatted results.

    This works for detectors and instance segmentation, where a segmentation is
    always accompanied by a bounding box.

    :param res: Detectron2 predictor output (a dictionary ``{'instances': Instances}``)
    :param model_catids: A category label list, as provided by the Detectron2 model's metadata

    Returns a FiftyOne ``Detections`` instance that can be attached to a FiftyOne
    ``Sample`` instance.
    """
    assert (
        len(model_catids) > 0
    ), "please provide the MODEL's ORIGINAL category label list. Get it from the detectron2 model's metadata."
    """Which you would do with:

    ::

        model_dataset = cfg.DATASETS.TRAIN[0]
        model_meta = MetadataCatalog.get(model_dataset)
        model_meta.thing_classes
    """
    instances = res["instances"]
    """For example:

    ::

        Instances(num_instances=5, image_height=447, image_width=1024, fields=[
            pred_boxes: Boxes(tensor([[130.7466,  12.2867, 962.2325, 354.4287],
                                      [941.1395, 268.9208, 978.8932, 300.9435],
                                      [891.1142, 275.1706, 942.4617, 299.9345],
                                      [112.7023, 152.5475, 307.6531, 241.2931],
                                      [815.8085, 311.5709, 898.5360, 346.5404]])),
            scores: tensor([0.9964, 0.9494, 0.9204, 0.7443, 0.6773]),
            pred_classes: tensor([4, 7, 7, 4, 7])])
    """
    dets = []
    # all models give scores & pred_classes
    # for bbox, score, pred_class in zip(instances.pred_boxes, instances.scores, instances.pred_classes):
    for i, score in enumerate(instances.scores):
        bbox = None
        mask = None
        class_index = instances[i].pred_classes.detach().item()  # index to label
        try:
            label = model_catids[class_index]
        except IndexError:
            print(
                "model gave pred_class",
                class_index,
                "but the model_catids provided has length of only",
                len(model_catids),
            )
            raise
        if allowed_labels is None:
            pass
        elif label not in allowed_labels:
            if verbose:
                print("detectron251: skipping label", label)
            continue
        # print(bbox.to("cpu").tolist(), score.to("cpu").item())
        height, width = instances.image_size
        """https://voxel51.com/docs/fiftyone/api/fiftyone.core.labels.html#fiftyone.core.labels.Detection

        bbox format: relative (0->1) bbox coordinates:
        [<top-left-x>, <top-left-y>, <width>, <height>]

        mask: an instance segmentation mask for the detection within its bounding
        box, which should be a 2D binary or 0/1 integer numpy array
        """
        # bboxes
        if hasattr(instances, "pred_boxes"):
            boxObject = instances.pred_boxes[i]  # indexing returns a Boxes object
            for t in boxObject:  # noqa: B007 # TODO (sampsa) "Loop control variable 't' not used within the loop body"
                # so annoying.. iterating is the only way to get the tensor out
                pass
            x, y, x2, y2 = t.detach().tolist()  # detectron2: abs coordinates
            bbox = [
                x / width,
                y / height,
                (x2 - x) / width,
                (y2 - y) / height,
            ]  # fiftyone: relx, rely, relw, relh
            # print(bbox)
        # segmentation
        if hasattr(instances, "pred_masks"):
            # crop the full-image mask to the bbox (assumes pred_boxes was present above)
            mask = instances.pred_masks[i].cpu().detach().numpy()
            x_ = floor(x)
            y_ = floor(y)
            x2_ = floor(x2)
            y2_ = floor(y2)
            small_mask = mask[y_:y2_, x_:x2_]
        if bbox is not None:
            # TODO: What if we need both metrics when using Mask R-CNN?
            if mask is None:
                dets.append(
                    Detection(
                        label=label, confidence=score.detach().item(), bounding_box=bbox
                    )
                )
            else:  # we also have a mask
                dets.append(
                    Detection(
                        label=label,
                        confidence=score.detach().item(),
                        bounding_box=bbox,
                        mask=small_mask,
                    )
                )
    detections = Detections(detections=dets)
    return detections

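
# --- Usage sketch (illustration only, not part of the original module) ---
# Running a Detectron2 predictor on one FiftyOne sample and attaching the
# converted results under a "predictions" field (the field name follows the
# convention mentioned in FO2DetectronDataset.__init__; the model and dataset
# choices are assumptions).
def _example_detectron251():  # pragma: no cover
    import cv2
    import fiftyone as fo
    from detectron2 import model_zoo
    from detectron2.config import get_cfg
    from detectron2.data import MetadataCatalog
    from detectron2.engine import DefaultPredictor

    cfg = get_cfg()
    cfg.merge_from_file(
        model_zoo.get_config_file("COCO-Detection/faster_rcnn_X_101_32x8d_FPN_3x.yaml")
    )
    cfg.MODEL.WEIGHTS = model_zoo.get_checkpoint_url(
        "COCO-Detection/faster_rcnn_X_101_32x8d_FPN_3x.yaml"
    )
    cfg.MODEL.DEVICE = "cpu"  # keep the sketch GPU-free
    predictor = DefaultPredictor(cfg)
    model_meta = MetadataCatalog.get(cfg.DATASETS.TRAIN[0])

    dataset = fo.load_dataset("oiv6-mpeg-detection-v1")  # hypothetical name
    sample = dataset.first()
    res = predictor(cv2.imread(sample.filepath))  # {'instances': Instances}
    sample["predictions"] = detectron251(res, model_catids=model_meta.thing_classes)
    sample.save()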