Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Serialized owlv2 model #889

Merged
merged 23 commits into from
Dec 20, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions inference/core/roboflow_api.py
Original file line number Diff line number Diff line change
Expand Up @@ -205,6 +205,7 @@ def get_roboflow_model_type(
class ModelEndpointType(Enum):
ORT = "ort"
CORE_MODEL = "core_model"
OWLV2 = "owlv2"


@wrap_roboflow_api_errors()
Expand Down
239 changes: 216 additions & 23 deletions inference/models/owlv2/owlv2.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,9 @@
import hashlib
import os
import pickle
import weakref
from collections import defaultdict
from typing import Any, Dict, List, Literal, NewType, Tuple, Union
from typing import Any, Dict, List, Literal, NewType, Optional, Tuple, Union

import numpy as np
import torch
Expand All @@ -11,6 +12,8 @@
from transformers import Owlv2ForObjectDetection, Owlv2Processor
from transformers.models.owlv2.modeling_owlv2 import box_iou

from inference.core.cache.model_artifacts import save_bytes_in_cache
from inference.core.entities.requests.inference import ObjectDetectionInferenceRequest
from inference.core.entities.responses.inference import (
InferenceResponseImage,
ObjectDetectionInferenceResponse,
Expand All @@ -19,15 +22,23 @@
from inference.core.env import (
DEVICE,
MAX_DETECTIONS,
MODEL_CACHE_DIR,
OWLV2_IMAGE_CACHE_SIZE,
OWLV2_MODEL_CACHE_SIZE,
OWLV2_VERSION_ID,
)
from inference.core.exceptions import ModelArtefactError
from inference.core.models.roboflow import (
DEFAULT_COLOR_PALETTE,
RoboflowCoreModel,
RoboflowInferenceModel,
draw_detection_predictions,
)
from inference.core.roboflow_api import (
ModelEndpointType,
get_from_url,
get_roboflow_model_data,
)
from inference.core.utils.image_utils import (
ImageType,
extract_image_payload_and_type,
Expand Down Expand Up @@ -71,6 +82,26 @@ def _check_size_limit(self):
self.popitem(last=False)


class Owlv2Singleton:
    """Process-wide cache of loaded OWLv2 models, keyed by HuggingFace id.

    Constructing this class repeatedly with the same ``huggingface_id``
    yields the same instance (and therefore the same loaded model), so
    several wrappers can share one copy of the heavy weights. Entries live
    in a ``WeakValueDictionary``: an entry is dropped automatically once no
    caller holds a strong reference to the instance, which bounds RAM use
    on long-running processes that touch many distinct models.
    """

    _instances = weakref.WeakValueDictionary()

    def __new__(cls, huggingface_id: str):
        existing = cls._instances.get(huggingface_id)
        if existing is not None:
            return existing

        instance = super().__new__(cls)
        instance.huggingface_id = huggingface_id
        # Load the model once, eagerly, so every holder of this singleton
        # shares the same weights on DEVICE.
        model = (
            Owlv2ForObjectDetection.from_pretrained(huggingface_id)
            .eval()
            .to(DEVICE)
        )
        # torch.compile of the vision backbone breaks on torch < 2.1, so ask
        # dynamo to fall back to eager execution instead of raising.
        torch._dynamo.config.suppress_errors = True
        model.owlv2.vision_model = torch.compile(model.owlv2.vision_model)
        instance.model = model
        cls._instances[huggingface_id] = instance
        return instance


def preprocess_image(
np_image: np.ndarray,
image_size: Tuple[int, int],
Expand Down Expand Up @@ -258,7 +289,7 @@ def hash_wrapped_training_data(wrapped_training_data: List[Dict[str, Any]]) -> H
return hash_function(pickle.dumps(just_hash_relevant_data))


class OwlV2(RoboflowCoreModel):
class OwlV2(RoboflowInferenceModel):
task_type = "object-detection"
box_format = "xywh"

Expand All @@ -273,21 +304,14 @@ def __init__(self, *args, model_id=f"owlv2/{OWLV2_VERSION_ID}", **kwargs):
self.image_std = torch.tensor(
processor.image_processor.image_std, device=DEVICE
).view(1, 3, 1, 1)
self.model = Owlv2ForObjectDetection.from_pretrained(hf_id).eval().to(DEVICE)
self.model = Owlv2Singleton(hf_id).model
self.reset_cache()

# compile forward pass of the visual backbone of the model
# NOTE that this is able to fix the manual attention implementation used in OWLv2
# so we don't have to force in flash attention by ourselves
# however that is only true if torch version 2.4 or later is used
# for torch < 2.4, this is a LOT slower and using flash attention by ourselves is faster
# this also breaks in torch < 2.1 so we supress torch._dynamo errors
torch._dynamo.config.suppress_errors = True
self.model.owlv2.vision_model = torch.compile(self.model.owlv2.vision_model)

def reset_cache(self):
# each entry should be on the order of 300*4KB, so 1000 is 400MB of CUDA memory
self.image_embed_cache = LimitedSizeDict(size_limit=OWLV2_IMAGE_CACHE_SIZE)
# no need for limit here, as we're only storing on CPU
self.cpu_image_embed_cache = dict()
# each entry should be on the order of 10 bytes, so 1000 is 10KB
self.image_size_cache = LimitedSizeDict(size_limit=OWLV2_IMAGE_CACHE_SIZE)
# entry size will vary depending on the number of samples, but 10 should be safe
Expand Down Expand Up @@ -323,6 +347,16 @@ def download_weights(self) -> None:
# Download from huggingface
pass

def get_image_embeds(self, image_hash: Hash) -> Optional[torch.Tensor]:
    """Look up cached image embeddings for ``image_hash``.

    The on-device cache is consulted first; on a miss there, the CPU-side
    cache is checked and any hit is moved to ``DEVICE`` before being
    returned. Returns ``None`` when the hash is absent from both caches.
    """
    if image_hash in self.image_embed_cache:
        return self.image_embed_cache[image_hash]
    if image_hash in self.cpu_image_embed_cache:
        # CPU-cached entries are tuples of tensors; move each to DEVICE.
        return tuple(
            t.to(DEVICE) for t in self.cpu_image_embed_cache[image_hash]
        )
    return None

def compute_image_size(
self, image: Union[np.ndarray, LazyImageRetrievalWrapper]
) -> Tuple[int, int]:
Expand All @@ -342,7 +376,7 @@ def embed_image(self, image: Union[np.ndarray, LazyImageRetrievalWrapper]) -> Ha
else:
image_hash = hash_function(image.tobytes())

if image_hash in self.image_embed_cache:
if (image_embeds := self.get_image_embeds(image_hash)) is not None:
return image_hash

np_image = (
Expand Down Expand Up @@ -402,12 +436,10 @@ def get_query_embedding(
# NOTE: for now we're handling each image seperately
query_embeds = []
for image_hash, query_boxes in query_spec.items():
try:
_objectness, image_boxes, image_class_embeds, _, _ = (
self.image_embed_cache[image_hash]
)
except KeyError as error:
raise KeyError("We didn't embed the image first!") from error
image_embeds = self.get_image_embeds(image_hash)
if image_embeds is None:
raise KeyError("We didn't embed the image first!")
_objectness, image_boxes, image_class_embeds, _, _ = image_embeds

query_boxes_tensor = torch.tensor(
query_boxes, dtype=image_boxes.dtype, device=image_boxes.device
Expand Down Expand Up @@ -438,7 +470,10 @@ def infer_from_embed(
confidence: float,
iou_threshold: float,
) -> List[Dict]:
_, image_boxes, image_class_embeds, _, _ = self.image_embed_cache[image_hash]
image_embeds = self.get_image_embeds(image_hash)
if image_embeds is None:
raise KeyError("We didn't embed the image first!")
_, image_boxes, image_class_embeds, _, _ = image_embeds
class_map, class_names = make_class_map(query_embeddings)
all_predicted_boxes, all_predicted_classes, all_predicted_scores = [], [], []
for class_name, pos_neg_embedding_dict in query_embeddings.items():
Expand Down Expand Up @@ -494,14 +529,25 @@ def infer(
self,
image: Any,
training_data: Dict,
confidence=0.99,
iou_threshold=0.3,
confidence: float = 0.99,
iou_threshold: float = 0.3,
**kwargs,
):
class_embeddings_dict = self.make_class_embeddings_dict(
training_data, iou_threshold
)
return self.infer_from_embedding_dict(
image, class_embeddings_dict, confidence, iou_threshold
)

def infer_from_embedding_dict(
self,
image: Any,
class_embeddings_dict: Dict[str, PosNegDictType],
confidence: float,
iou_threshold: float,
**kwargs,
):
if not isinstance(image, list):
images = [image]
else:
Expand All @@ -526,7 +572,10 @@ def infer(
)

def make_class_embeddings_dict(
self, training_data: List[Any], iou_threshold: float
self,
training_data: List[Any],
iou_threshold: float,
return_image_embeds: bool = False,
) -> Dict[str, PosNegDictType]:
wrapped_training_data = [
{
Expand All @@ -547,9 +596,16 @@ def make_class_embeddings_dict(
class_embeddings_dict = defaultdict(lambda: {"positive": [], "negative": []})

bool_to_literal = {True: "positive", False: "negative"}
return_image_embeds_dict = dict()
for train_image in wrapped_training_data:
# grab and embed image
image_hash = self.embed_image(train_image["image"])
if return_image_embeds:
if (image_embeds := self.get_image_embeds(image_hash)) is None:
raise KeyError("We didn't embed the image first!")
return_image_embeds_dict[image_hash] = tuple(
t.to("cpu") for t in image_embeds
)

# grab and normalize box prompts for this image
image_size = self.compute_image_size(train_image["image"])
Expand Down Expand Up @@ -586,6 +642,8 @@ def make_class_embeddings_dict(
}

self.class_embeddings_cache[wrapped_training_data_hash] = class_embeddings_dict
if return_image_embeds:
return class_embeddings_dict, return_image_embeds_dict

return class_embeddings_dict

Expand Down Expand Up @@ -614,3 +672,138 @@ def make_response(self, predictions, image_sizes, class_names):
for ind, batch_predictions in enumerate(predictions)
]
return responses


class SerializedOwlV2(RoboflowInferenceModel):
    """OWLv2 model restored from a serialized few-shot training artifact.

    The artifact (``weights.pt``) bundles the HuggingFace model id, the
    per-class positive/negative query embeddings built from the training
    data, the class names, and (optionally) the CPU-side image-embedding
    cache, so inference can run without re-embedding the training images.
    """

    task_type = "object-detection"
    box_format = "xywh"
    # Filename of the serialized artifact inside the model cache directory.
    weights_file_path = "weights.pt"

    def __init__(self, model_id, *args, **kwargs):
        super().__init__(model_id, *args, **kwargs)
        self.get_model_artifacts()

    @classmethod
    def serialize_training_data(
        cls,
        training_data: List[Any],
        hf_id: str = f"google/{OWLV2_VERSION_ID}",
        iou_threshold: float = 0.3,
        save_dir: str = os.path.join(MODEL_CACHE_DIR, "owl-v2-serialized-data"),
    ):
        """Embed ``training_data`` with a fresh OwlV2 and serialize the result.

        Returns the path of the saved artifact (see :meth:`save_model`).
        """
        roboflow_id = hf_id.replace("google/", "owlv2/")
        owlv2 = OwlV2(model_id=roboflow_id)
        train_data_dict, image_embeds = owlv2.make_class_embeddings_dict(
            training_data, iou_threshold, return_image_embeds=True
        )
        return cls.save_model(
            hf_id, roboflow_id, train_data_dict, image_embeds, save_dir
        )

    @classmethod
    def save_model(
        cls,
        hf_id: str,
        roboflow_id: str,
        train_data_dict: Dict,
        image_embeds: Dict,
        save_dir: str,
    ):
        """Write the serialized artifact to ``save_dir`` and return its path."""
        # Use a distinct name for the payload instead of shadowing the
        # `train_data_dict` parameter, which the "class_names" entry reads.
        payload = {
            "huggingface_id": hf_id,
            "train_data_dict": train_data_dict,
            "class_names": list(train_data_dict.keys()),
            "roboflow_id": roboflow_id,
            "image_embeds": image_embeds,
        }
        train_data_path = os.path.join(save_dir, cls.weights_file_path)
        os.makedirs(save_dir, exist_ok=True)
        torch.save(payload, train_data_path)
        return train_data_path

    @property
    def weights_file(self):
        # Name of the artifact file; resolved against the model cache by
        # self.cache_file().
        return self.weights_file_path

    def get_infer_bucket_file_list(self):
        # No auxiliary files beyond the weights artifact.
        return []

    def download_model_artefacts_from_s3(self):
        raise NotImplementedError("Owlv2 not currently supported on hosted inference")

    def download_model_artifacts_from_roboflow_api(self):
        """Fetch the serialized artifact via the Roboflow API and cache it."""
        api_data = get_roboflow_model_data(
            api_key=self.api_key,
            model_id=self.endpoint,
            endpoint_type=ModelEndpointType.OWLV2,
            device_id=self.device_id,
        )
        api_data = api_data["owlv2"]
        if "model" not in api_data:
            raise ModelArtefactError(
                "Could not find `model` key in roboflow API model description response."
            )
        model_weights_response = get_from_url(api_data["model"], json_response=False)
        save_bytes_in_cache(
            content=model_weights_response.content,
            file=self.weights_file,
            model_id=self.endpoint,
        )

    def load_model_artifacts_from_cache(self):
        """Load the cached artifact and wire up the underlying OwlV2 model."""
        # On CPU-only hosts, remap any CUDA tensors stored in the artifact.
        map_location = "cpu" if DEVICE == "cpu" else None
        # NOTE(security): torch.load unpickles arbitrary objects; the artifact
        # must come from a trusted source (here: the Roboflow API).
        self.model_data = torch.load(
            self.cache_file(self.weights_file), map_location=map_location
        )
        self.class_names = self.model_data["class_names"]
        self.train_data_dict = self.model_data["train_data_dict"]
        self.huggingface_id = self.model_data["huggingface_id"]
        self.roboflow_id = self.model_data["roboflow_id"]
        # Each SerializedOwlV2 can own an OwlV2 wrapper cheaply because the
        # heavy weights are shared through Owlv2Singleton.
        self.owlv2 = OwlV2(model_id=self.roboflow_id)
        self.owlv2.cpu_image_embed_cache = self.model_data["image_embeds"]

    def infer_from_request(
        self,
        request: ObjectDetectionInferenceRequest,
    ) -> Union[
        List[ObjectDetectionInferenceResponse], ObjectDetectionInferenceResponse
    ]:
        return super().infer_from_request(request)

    def infer(
        self, image, confidence: float = 0.99, iou_threshold: float = 0.3, **kwargs
    ):
        """Run detection on ``image`` using the deserialized class embeddings."""
        return self.owlv2.infer_from_embedding_dict(
            image,
            self.train_data_dict,
            confidence=confidence,
            iou_threshold=iou_threshold,
            **kwargs,
        )

    def draw_predictions(
        self,
        inference_request: ObjectDetectionInferenceRequest,
        inference_response: ObjectDetectionInferenceResponse,
    ):
        return self.owlv2.draw_predictions(
            inference_request,
            inference_response,
        )

    def save_small_model_without_image_embeds(
        self, save_dir: str = os.path.join(MODEL_CACHE_DIR, "owl-v2-serialized-data")
    ):
        """Re-serialize the model without image embeddings to shrink the artifact.

        Also clears this instance's CPU embedding cache to release the memory
        those embeddings occupy.
        """
        self.owlv2.cpu_image_embed_cache = dict()
        return self.save_model(
            self.huggingface_id,
            self.roboflow_id,
            self.train_data_dict,
            self.owlv2.cpu_image_embed_cache,
            save_dir,
        )
3 changes: 2 additions & 1 deletion inference/models/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -314,9 +314,10 @@
pass

try:
    from inference.models.owlv2.owlv2 import OwlV2, SerializedOwlV2

    ROBOFLOW_MODEL_TYPES[("object-detection", "owlv2")] = OwlV2
    ROBOFLOW_MODEL_TYPES[("object-detection", "owlv2-finetuned")] = SerializedOwlV2
# OWLv2 is an optional dependency: skip registration when it (or its
# transitive deps) are unavailable. Catch Exception rather than a bare
# `except:` so KeyboardInterrupt/SystemExit still propagate.
except Exception:
    pass

Expand Down
Loading
Loading