Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Serialized owlv2 model #889

Merged
merged 23 commits into from
Dec 20, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions inference/core/roboflow_api.py
Original file line number Diff line number Diff line change
Expand Up @@ -205,6 +205,7 @@ def get_roboflow_model_type(
class ModelEndpointType(Enum):
ORT = "ort"
CORE_MODEL = "core_model"
OWLV2 = "owlv2"


@wrap_roboflow_api_errors()
Expand Down
239 changes: 216 additions & 23 deletions inference/models/owlv2/owlv2.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,9 @@
import hashlib
import os
import pickle
import weakref
from collections import defaultdict
from typing import Any, Dict, List, Literal, NewType, Tuple, Union
from typing import Any, Dict, List, Literal, NewType, Optional, Tuple, Union

import numpy as np
import torch
Expand All @@ -11,6 +12,8 @@
from transformers import Owlv2ForObjectDetection, Owlv2Processor
from transformers.models.owlv2.modeling_owlv2 import box_iou

from inference.core.cache.model_artifacts import save_bytes_in_cache
from inference.core.entities.requests.inference import ObjectDetectionInferenceRequest
from inference.core.entities.responses.inference import (
InferenceResponseImage,
ObjectDetectionInferenceResponse,
Expand All @@ -19,15 +22,23 @@
from inference.core.env import (
DEVICE,
MAX_DETECTIONS,
MODEL_CACHE_DIR,
OWLV2_IMAGE_CACHE_SIZE,
OWLV2_MODEL_CACHE_SIZE,
OWLV2_VERSION_ID,
)
from inference.core.exceptions import ModelArtefactError
from inference.core.models.roboflow import (
DEFAULT_COLOR_PALETTE,
RoboflowCoreModel,
RoboflowInferenceModel,
draw_detection_predictions,
)
from inference.core.roboflow_api import (
ModelEndpointType,
get_from_url,
get_roboflow_model_data,
)
from inference.core.utils.image_utils import (
ImageType,
extract_image_payload_and_type,
Expand Down Expand Up @@ -71,6 +82,26 @@ def _check_size_limit(self):
self.popitem(last=False)


class Owlv2Singleton:
    """Process-wide cache of loaded OWLv2 models, keyed by HuggingFace id.

    Constructing this class repeatedly with the same ``huggingface_id``
    yields the same instance (and therefore the same loaded model), so
    several wrappers can share one copy of the heavy weights. Entries live
    in a ``WeakValueDictionary``: an entry is dropped automatically once no
    caller holds a strong reference to the instance, which bounds RAM use
    on long-running processes that touch many distinct models.
    """

    _instances = weakref.WeakValueDictionary()

    def __new__(cls, huggingface_id: str):
        existing = cls._instances.get(huggingface_id)
        if existing is not None:
            return existing

        instance = super().__new__(cls)
        instance.huggingface_id = huggingface_id
        # Load the model once, eagerly, so every holder of this singleton
        # shares the same weights on DEVICE.
        model = (
            Owlv2ForObjectDetection.from_pretrained(huggingface_id)
            .eval()
            .to(DEVICE)
        )
        # torch.compile of the vision backbone breaks on torch < 2.1, so ask
        # dynamo to fall back to eager execution instead of raising.
        torch._dynamo.config.suppress_errors = True
        model.owlv2.vision_model = torch.compile(model.owlv2.vision_model)
        instance.model = model
        cls._instances[huggingface_id] = instance
        return instance


def preprocess_image(
np_image: np.ndarray,
image_size: Tuple[int, int],
Expand Down Expand Up @@ -258,7 +289,7 @@ def hash_wrapped_training_data(wrapped_training_data: List[Dict[str, Any]]) -> H
return hash_function(pickle.dumps(just_hash_relevant_data))


class OwlV2(RoboflowCoreModel):
class OwlV2(RoboflowInferenceModel):
task_type = "object-detection"
box_format = "xywh"

Expand All @@ -273,21 +304,14 @@ def __init__(self, *args, model_id=f"owlv2/{OWLV2_VERSION_ID}", **kwargs):
self.image_std = torch.tensor(
processor.image_processor.image_std, device=DEVICE
).view(1, 3, 1, 1)
self.model = Owlv2ForObjectDetection.from_pretrained(hf_id).eval().to(DEVICE)
self.model = Owlv2Singleton(hf_id).model
self.reset_cache()

# compile forward pass of the visual backbone of the model
# NOTE that this is able to fix the manual attention implementation used in OWLv2
# so we don't have to force in flash attention by ourselves
# however that is only true if torch version 2.4 or later is used
# for torch < 2.4, this is a LOT slower and using flash attention by ourselves is faster
# this also breaks in torch < 2.1 so we supress torch._dynamo errors
torch._dynamo.config.suppress_errors = True
self.model.owlv2.vision_model = torch.compile(self.model.owlv2.vision_model)

def reset_cache(self):
# each entry should be on the order of 300*4KB, so 1000 is 400MB of CUDA memory
self.image_embed_cache = LimitedSizeDict(size_limit=OWLV2_IMAGE_CACHE_SIZE)
# no need for limit here, as we're only storing on CPU
self.cpu_image_embed_cache = dict()
# each entry should be on the order of 10 bytes, so 1000 is 10KB
self.image_size_cache = LimitedSizeDict(size_limit=OWLV2_IMAGE_CACHE_SIZE)
# entry size will vary depending on the number of samples, but 10 should be safe
Expand Down Expand Up @@ -323,6 +347,16 @@ def download_weights(self) -> None:
# Download from huggingface
pass

def get_image_embeds(self, image_hash: Hash) -> Optional[torch.Tensor]:
    """Look up cached image embeddings for ``image_hash``.

    The on-device cache is consulted first; on a miss there, the CPU-side
    cache is checked and any hit is moved to ``DEVICE`` before being
    returned. Returns ``None`` when the hash is absent from both caches.
    """
    if image_hash in self.image_embed_cache:
        return self.image_embed_cache[image_hash]
    if image_hash in self.cpu_image_embed_cache:
        # CPU-cached entries are tuples of tensors; move each to DEVICE.
        return tuple(
            t.to(DEVICE) for t in self.cpu_image_embed_cache[image_hash]
        )
    return None

def compute_image_size(
self, image: Union[np.ndarray, LazyImageRetrievalWrapper]
) -> Tuple[int, int]:
Expand All @@ -342,7 +376,7 @@ def embed_image(self, image: Union[np.ndarray, LazyImageRetrievalWrapper]) -> Ha
else:
image_hash = hash_function(image.tobytes())

if image_hash in self.image_embed_cache:
if (image_embeds := self.get_image_embeds(image_hash)) is not None:
return image_hash

np_image = (
Expand Down Expand Up @@ -402,12 +436,10 @@ def get_query_embedding(
# NOTE: for now we're handling each image seperately
query_embeds = []
for image_hash, query_boxes in query_spec.items():
try:
_objectness, image_boxes, image_class_embeds, _, _ = (
self.image_embed_cache[image_hash]
)
except KeyError as error:
raise KeyError("We didn't embed the image first!") from error
image_embeds = self.get_image_embeds(image_hash)
if image_embeds is None:
raise KeyError("We didn't embed the image first!")
_objectness, image_boxes, image_class_embeds, _, _ = image_embeds

query_boxes_tensor = torch.tensor(
query_boxes, dtype=image_boxes.dtype, device=image_boxes.device
Expand Down Expand Up @@ -438,7 +470,10 @@ def infer_from_embed(
confidence: float,
iou_threshold: float,
) -> List[Dict]:
_, image_boxes, image_class_embeds, _, _ = self.image_embed_cache[image_hash]
image_embeds = self.get_image_embeds(image_hash)
if image_embeds is None:
raise KeyError("We didn't embed the image first!")
_, image_boxes, image_class_embeds, _, _ = image_embeds
class_map, class_names = make_class_map(query_embeddings)
all_predicted_boxes, all_predicted_classes, all_predicted_scores = [], [], []
for class_name, pos_neg_embedding_dict in query_embeddings.items():
Expand Down Expand Up @@ -494,14 +529,25 @@ def infer(
self,
image: Any,
training_data: Dict,
confidence=0.99,
iou_threshold=0.3,
confidence: float = 0.99,
iou_threshold: float = 0.3,
**kwargs,
):
class_embeddings_dict = self.make_class_embeddings_dict(
training_data, iou_threshold
)
return self.infer_from_embedding_dict(
image, class_embeddings_dict, confidence, iou_threshold
)

def infer_from_embedding_dict(
self,
image: Any,
class_embeddings_dict: Dict[str, PosNegDictType],
confidence: float,
iou_threshold: float,
**kwargs,
):
if not isinstance(image, list):
images = [image]
else:
Expand All @@ -526,7 +572,10 @@ def infer(
)

def make_class_embeddings_dict(
self, training_data: List[Any], iou_threshold: float
self,
training_data: List[Any],
iou_threshold: float,
return_image_embeds: bool = False,
) -> Dict[str, PosNegDictType]:
wrapped_training_data = [
{
Expand All @@ -547,9 +596,16 @@ def make_class_embeddings_dict(
class_embeddings_dict = defaultdict(lambda: {"positive": [], "negative": []})

bool_to_literal = {True: "positive", False: "negative"}
return_image_embeds_dict = dict()
for train_image in wrapped_training_data:
# grab and embed image
image_hash = self.embed_image(train_image["image"])
if return_image_embeds:
if (image_embeds := self.get_image_embeds(image_hash)) is None:
raise KeyError("We didn't embed the image first!")
return_image_embeds_dict[image_hash] = tuple(
t.to("cpu") for t in image_embeds
)

# grab and normalize box prompts for this image
image_size = self.compute_image_size(train_image["image"])
Expand Down Expand Up @@ -586,6 +642,8 @@ def make_class_embeddings_dict(
}

self.class_embeddings_cache[wrapped_training_data_hash] = class_embeddings_dict
if return_image_embeds:
return class_embeddings_dict, return_image_embeds_dict

return class_embeddings_dict

Expand Down Expand Up @@ -614,3 +672,138 @@ def make_response(self, predictions, image_sizes, class_names):
for ind, batch_predictions in enumerate(predictions)
]
return responses


class SerializedOwlV2(RoboflowInferenceModel):
    """OWLv2 model restored from a serialized few-shot training artifact.

    The artifact (``weights.pt``) bundles the HuggingFace model id, the
    per-class positive/negative query embeddings built from the training
    data, the class names, and (optionally) the CPU-side image-embedding
    cache, so inference can run without re-embedding the training images.
    """

    task_type = "object-detection"
    box_format = "xywh"
    # Filename of the serialized artifact inside the model cache directory.
    weights_file_path = "weights.pt"

    def __init__(self, model_id, *args, **kwargs):
        super().__init__(model_id, *args, **kwargs)
        self.get_model_artifacts()

    @classmethod
    def serialize_training_data(
        cls,
        training_data: List[Any],
        hf_id: str = f"google/{OWLV2_VERSION_ID}",
        iou_threshold: float = 0.3,
        save_dir: str = os.path.join(MODEL_CACHE_DIR, "owl-v2-serialized-data"),
    ):
        """Embed ``training_data`` with a fresh OwlV2 and serialize the result.

        Returns the path of the saved artifact (see :meth:`save_model`).
        """
        roboflow_id = hf_id.replace("google/", "owlv2/")
        owlv2 = OwlV2(model_id=roboflow_id)
        train_data_dict, image_embeds = owlv2.make_class_embeddings_dict(
            training_data, iou_threshold, return_image_embeds=True
        )
        return cls.save_model(
            hf_id, roboflow_id, train_data_dict, image_embeds, save_dir
        )

    @classmethod
    def save_model(
        cls,
        hf_id: str,
        roboflow_id: str,
        train_data_dict: Dict,
        image_embeds: Dict,
        save_dir: str,
    ):
        """Write the serialized artifact to ``save_dir`` and return its path."""
        # Use a distinct name for the payload instead of shadowing the
        # `train_data_dict` parameter, which the "class_names" entry reads.
        payload = {
            "huggingface_id": hf_id,
            "train_data_dict": train_data_dict,
            "class_names": list(train_data_dict.keys()),
            "roboflow_id": roboflow_id,
            "image_embeds": image_embeds,
        }
        train_data_path = os.path.join(save_dir, cls.weights_file_path)
        os.makedirs(save_dir, exist_ok=True)
        torch.save(payload, train_data_path)
        return train_data_path

    @property
    def weights_file(self):
        # Name of the artifact file; resolved against the model cache by
        # self.cache_file().
        return self.weights_file_path

    def get_infer_bucket_file_list(self):
        # No auxiliary files beyond the weights artifact.
        return []

    def download_model_artefacts_from_s3(self):
        raise NotImplementedError("Owlv2 not currently supported on hosted inference")

    def download_model_artifacts_from_roboflow_api(self):
        """Fetch the serialized artifact via the Roboflow API and cache it."""
        api_data = get_roboflow_model_data(
            api_key=self.api_key,
            model_id=self.endpoint,
            endpoint_type=ModelEndpointType.OWLV2,
            device_id=self.device_id,
        )
        api_data = api_data["owlv2"]
        if "model" not in api_data:
            raise ModelArtefactError(
                "Could not find `model` key in roboflow API model description response."
            )
        model_weights_response = get_from_url(api_data["model"], json_response=False)
        save_bytes_in_cache(
            content=model_weights_response.content,
            file=self.weights_file,
            model_id=self.endpoint,
        )

    def load_model_artifacts_from_cache(self):
        """Load the cached artifact and wire up the underlying OwlV2 model."""
        # On CPU-only hosts, remap any CUDA tensors stored in the artifact.
        map_location = "cpu" if DEVICE == "cpu" else None
        # NOTE(security): torch.load unpickles arbitrary objects; the artifact
        # must come from a trusted source (here: the Roboflow API).
        self.model_data = torch.load(
            self.cache_file(self.weights_file), map_location=map_location
        )
        self.class_names = self.model_data["class_names"]
        self.train_data_dict = self.model_data["train_data_dict"]
        self.huggingface_id = self.model_data["huggingface_id"]
        self.roboflow_id = self.model_data["roboflow_id"]
        # Each SerializedOwlV2 can own an OwlV2 wrapper cheaply because the
        # heavy weights are shared through Owlv2Singleton.
        self.owlv2 = OwlV2(model_id=self.roboflow_id)
        self.owlv2.cpu_image_embed_cache = self.model_data["image_embeds"]

    def infer_from_request(
        self,
        request: ObjectDetectionInferenceRequest,
    ) -> Union[
        List[ObjectDetectionInferenceResponse], ObjectDetectionInferenceResponse
    ]:
        return super().infer_from_request(request)

    def infer(
        self, image, confidence: float = 0.99, iou_threshold: float = 0.3, **kwargs
    ):
        """Run detection on ``image`` using the deserialized class embeddings."""
        return self.owlv2.infer_from_embedding_dict(
            image,
            self.train_data_dict,
            confidence=confidence,
            iou_threshold=iou_threshold,
            **kwargs,
        )

    def draw_predictions(
        self,
        inference_request: ObjectDetectionInferenceRequest,
        inference_response: ObjectDetectionInferenceResponse,
    ):
        return self.owlv2.draw_predictions(
            inference_request,
            inference_response,
        )

    def save_small_model_without_image_embeds(
        self, save_dir: str = os.path.join(MODEL_CACHE_DIR, "owl-v2-serialized-data")
    ):
        """Re-serialize the model without image embeddings to shrink the artifact.

        Also clears this instance's CPU embedding cache to release the memory
        those embeddings occupy.
        """
        self.owlv2.cpu_image_embed_cache = dict()
        return self.save_model(
            self.huggingface_id,
            self.roboflow_id,
            self.train_data_dict,
            self.owlv2.cpu_image_embed_cache,
            save_dir,
        )
3 changes: 2 additions & 1 deletion inference/models/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -314,9 +314,10 @@
pass

try:
    from inference.models.owlv2.owlv2 import OwlV2, SerializedOwlV2

    ROBOFLOW_MODEL_TYPES[("object-detection", "owlv2")] = OwlV2
    ROBOFLOW_MODEL_TYPES[("object-detection", "owlv2-finetuned")] = SerializedOwlV2
# OWLv2 is an optional dependency: skip registration when it (or its
# transitive deps) are unavailable. Catch Exception rather than a bare
# `except:` so KeyboardInterrupt/SystemExit still propagate.
except Exception:
    pass

Expand Down
Loading
Loading