feat(ml): model unloading (#2661)

* model cache * fixed revalidation when using cache namespace * fixed ttl not being set, added lock
2026-05-12 08:25:48 +00:00 · 2023-06-06 21:48:51 -04:00
parent 6ce35d47f5
commit d0cc231782
5 changed files with 400 additions and 233 deletions
--- a/machine-learning/app/cache.py
+++ b/machine-learning/app/cache.py
@@ -0,0 +1,84 @@
+from aiocache.plugins import TimingPlugin, BasePlugin
+from aiocache.backends.memory import SimpleMemoryCache
+from aiocache.lock import OptimisticLock
+from typing import Any
+from models import get_model
+
+
+class ModelCache:
+    """Fetches a model from an in-memory cache, instantiating it if it's missing."""
+
+    def __init__(
+        self,
+        ttl: int | None = None,
+        revalidate: bool = False,
+        timeout: int | None = None,
+        profiling: bool = False,
+    ):
+        """
+        Args:
+            ttl: Unloads model after this duration. Disabled if None. Defaults to None.
+            revalidate: Resets TTL on cache hit. Useful to keep models in memory while active. Defaults to False.
+            timeout: Maximum allowed time for model to load. Disabled if None. Defaults to None.
+            profiling: Collects metrics for cache operations, adding slight overhead. Defaults to False.
+        """
+
+        self.ttl = ttl
+        plugins = []
+
+        if revalidate:
+            plugins.append(RevalidationPlugin())
+        if profiling:
+            plugins.append(TimingPlugin())
+
+        self.cache = SimpleMemoryCache(
+            ttl=ttl, timeout=timeout, plugins=plugins, namespace=None
+        )
+
+    async def get_cached_model(
+        self, model_name: str, model_type: str, **model_kwargs
+    ) -> Any:
+        """
+        Args:
+            model_name: Name of model in the model hub used for the task.
+            model_type: Model type or task, which determines which model zoo is used.
+
+        Returns:
+            model: The requested model.
+        """
+
+        key = self.cache.build_key(model_name, model_type)
+        model = await self.cache.get(key)
+        if model is None:
+            async with OptimisticLock(self.cache, key) as lock:
+                model = get_model(model_name, model_type, **model_kwargs)
+                await lock.cas(model, ttl=self.ttl)
+        return model
+
+    async def get_profiling(self) -> dict[str, float] | None:
+        if not hasattr(self.cache, "profiling"):
+            return None
+
+        return self.cache.profiling  # type: ignore
+
+
+class RevalidationPlugin(BasePlugin):
+    """Revalidates cache item's TTL after cache hit."""
+
+    async def post_get(self, client, key, ret=None, namespace=None, **kwargs):
+        if ret is None:
+            return
+        if namespace is not None:
+            key = client.build_key(key, namespace)
+        if key in client._handlers:
+            await client.expire(key, client.ttl)
+
+    async def post_multi_get(self, client, keys, ret=None, namespace=None, **kwargs):
+        if ret is None:
+            return
+
+        for key, val in zip(keys, ret):
+            if namespace is not None:
+                key = client.build_key(key, namespace)
+            if val is not None and key in client._handlers:
+                await client.expire(key, client.ttl)
--- a/machine-learning/app/main.py
+++ b/machine-learning/app/main.py
@@ -1,5 +1,7 @@
 import os
 from typing import Any
+
+from cache import ModelCache
 from schemas import (
    EmbeddingResponse,
    FaceResponse,
@@ -9,16 +11,11 @@ from schemas import (
    TextResponse,
    VisionModelRequest,
 )
-import cv2 as cv
 import uvicorn

-from insightface.app import FaceAnalysis
-from transformers import pipeline
-from sentence_transformers import SentenceTransformer
-from transformers import Pipeline
 from PIL import Image
-from fastapi import FastAPI
-
+from fastapi import FastAPI, HTTPException
+from models import get_model, run_classification, run_facial_recognition

 classification_model = os.getenv(
    "MACHINE_LEARNING_CLASSIFICATION_MODEL", "microsoft/resnet-50"
@@ -29,21 +26,20 @@ facial_recognition_model = os.getenv(
    "MACHINE_LEARNING_FACIAL_RECOGNITION_MODEL", "buffalo_l"
 )

-min_face_score = float(os.getenv("MACHINE_LEARNING_MIN_FACE_SCORE", 0.7))
 min_tag_score = float(os.getenv("MACHINE_LEARNING_MIN_TAG_SCORE", 0.9))
 eager_startup = (
    os.getenv("MACHINE_LEARNING_EAGER_STARTUP", "true") == "true"
 )  # loads all models at startup
+model_ttl = int(os.getenv("MACHINE_LEARNING_MODEL_TTL", 300))

-cache_folder = os.getenv("MACHINE_LEARNING_CACHE_FOLDER", "/cache")
-
-_model_cache = {}
-
+_model_cache = None
 app = FastAPI()


@app.on_event("startup")
 async def startup_event() -> None:
+    global _model_cache
+    _model_cache = ModelCache(ttl=model_ttl, revalidate=True)
    models = [
        (classification_model, "image-classification"),
        (clip_image_model, "clip"),
@@ -54,9 +50,9 @@ async def startup_event() -> None:
    # Get all models
    for model_name, model_type in models:
        if eager_startup:
-            get_cached_model(model_name, model_type)
+            await _model_cache.get_cached_model(model_name, model_type)
        else:
-            _get_model(model_name, model_type)
+            get_model(model_name, model_type)


@app.get("/", response_model=MessageResponse)
@@ -70,10 +66,14 @@ def ping() -> str:


@app.post("/image-classifier/tag-image", response_model=TagResponse, status_code=200)
-def image_classification(payload: VisionModelRequest) -> list[str]:
-    model = get_cached_model(classification_model, "image-classification")
-    assetPath = payload.image_path
-    labels = run_engine(model, assetPath)
+async def image_classification(payload: VisionModelRequest) -> list[str]:
+    if _model_cache is None:
+        raise HTTPException(status_code=500, detail="Unable to load model.")
+
+    model = await _model_cache.get_cached_model(
+        classification_model, "image-classification"
+    )
+    labels = run_classification(model, payload.image_path, min_tag_score)
    return labels


@@ -82,10 +82,14 @@ def image_classification(payload: VisionModelRequest) -> list[str]:
    response_model=EmbeddingResponse,
    status_code=200,
 )
-def clip_encode_image(payload: VisionModelRequest) -> list[float]:
-    model = get_cached_model(clip_image_model, "clip")
+async def clip_encode_image(payload: VisionModelRequest) -> list[float]:
+    if _model_cache is None:
+        raise HTTPException(status_code=500, detail="Unable to load model.")
+
+    model = await _model_cache.get_cached_model(clip_image_model, "clip")
    image = Image.open(payload.image_path)
-    return model.encode(image).tolist()
+    embedding = model.encode(image).tolist()
+    return embedding


@app.post(
@@ -93,82 +97,27 @@ def clip_encode_image(payload: VisionModelRequest) -> list[float]:
    response_model=EmbeddingResponse,
    status_code=200,
 )
-def clip_encode_text(payload: TextModelRequest) -> list[float]:
-    model = get_cached_model(clip_text_model, "clip")
-    return model.encode(payload.text).tolist()
+async def clip_encode_text(payload: TextModelRequest) -> list[float]:
+    if _model_cache is None:
+        raise HTTPException(status_code=500, detail="Unable to load model.")
+
+    model = await _model_cache.get_cached_model(clip_text_model, "clip")
+    embedding = model.encode(payload.text).tolist()
+    return embedding


@app.post(
    "/facial-recognition/detect-faces", response_model=FaceResponse, status_code=200
 )
-def facial_recognition(payload: VisionModelRequest) -> list[dict[str, Any]]:
-    model = get_cached_model(facial_recognition_model, "facial-recognition")
-    img = cv.imread(payload.image_path)
-    height, width, _ = img.shape
-    results = []
-    faces = model.get(img)
+async def facial_recognition(payload: VisionModelRequest) -> list[dict[str, Any]]:
+    if _model_cache is None:
+        raise HTTPException(status_code=500, detail="Unable to load model.")

-    for face in faces:
-        if face.det_score < min_face_score:
-            continue
-        x1, y1, x2, y2 = face.bbox
-
-        results.append(
-            {
-                "imageWidth": width,
-                "imageHeight": height,
-                "boundingBox": {
-                    "x1": round(x1),
-                    "y1": round(y1),
-                    "x2": round(x2),
-                    "y2": round(y2),
-                },
-                "score": face.det_score.item(),
-                "embedding": face.normed_embedding.tolist(),
-            }
-        )
-    return results
-
-
-def run_engine(engine: Pipeline, path: str) -> list[str]:
-    result: list[str] = []
-    predictions: list[dict[str, Any]] = engine(path)  # type: ignore
-
-    for pred in predictions:
-        tags = pred["label"].split(", ")
-        if pred["score"] > min_tag_score:
-            result = [*result, *tags]
-
-    if len(result) > 1:
-        result = list(set(result))
-
-    return result
-
-
-def get_cached_model(model, task) -> Any:
-    global _model_cache
-    key = "|".join([model, str(task)])
-    if key not in _model_cache:
-        model = _get_model(model, task)
-        _model_cache[key] = model
-
-    return _model_cache[key]
-
-
-def _get_model(model, task) -> Any:
-    match task:
-        case "facial-recognition":
-            model = FaceAnalysis(
-                name=model,
-                root=cache_folder,
-                allowed_modules=["detection", "recognition"],
-            )
-            model.prepare(ctx_id=0, det_size=(640, 640))
-        case "clip":
-            model = SentenceTransformer(model, cache_folder=cache_folder)
-        case _:
-            model = pipeline(model=model, task=task)
-    return model
+    model = await _model_cache.get_cached_model(
+        facial_recognition_model, "facial-recognition"
+    )
+    faces = run_facial_recognition(model, payload.image_path)
+    return faces


 if __name__ == "__main__":
--- a/machine-learning/app/models.py
+++ b/machine-learning/app/models.py
@@ -0,0 +1,117 @@
+import torch
+from insightface.app import FaceAnalysis
+from pathlib import Path
+import os
+
+from transformers import pipeline, Pipeline
+from sentence_transformers import SentenceTransformer
+from typing import Any
+import cv2 as cv
+
+cache_folder = os.getenv("MACHINE_LEARNING_CACHE_FOLDER", "/cache")
+device = "cuda" if torch.cuda.is_available() else "cpu"
+
+
+def get_model(model_name: str, model_type: str, **model_kwargs):
+    """
+    Instantiates the specified model.
+
+    Args:
+        model_name: Name of model in the model hub used for the task.
+        model_type: Model type or task, which determines which model zoo is used.
+            `facial-recognition` uses Insightface, while all other models use the HF Model Hub.
+
+            Options:
+                `image-classification`, `clip`,`facial-recognition`, `tokenizer`, `processor`
+
+    Returns:
+        model: The requested model.
+    """
+
+    cache_dir = _get_cache_dir(model_name, model_type)
+    match model_type:
+        case "facial-recognition":
+            model = _load_facial_recognition(
+                model_name, cache_dir=cache_dir, **model_kwargs
+            )
+        case "clip":
+            model = SentenceTransformer(
+                model_name, cache_folder=cache_dir, **model_kwargs
+            )
+        case _:
+            model = pipeline(
+                model_type,
+                model_name,
+                model_kwargs={"cache_dir": cache_dir, **model_kwargs},
+            )
+
+    return model
+
+
+def run_classification(
+    model: Pipeline, image_path: str, min_score: float | None = None
+):
+    predictions: list[dict[str, Any]] = model(image_path)  # type: ignore
+    result = {
+        tag
+        for pred in predictions
+        for tag in pred["label"].split(", ")
+        if min_score is None or pred["score"] >= min_score
+    }
+
+    return list(result)
+
+
+def run_facial_recognition(
+    model: FaceAnalysis, image_path: str
+) -> list[dict[str, Any]]:
+    img = cv.imread(image_path)
+    height, width, _ = img.shape
+    results = []
+    faces = model.get(img)
+
+    for face in faces:
+        x1, y1, x2, y2 = face.bbox
+
+        results.append(
+            {
+                "imageWidth": width,
+                "imageHeight": height,
+                "boundingBox": {
+                    "x1": round(x1),
+                    "y1": round(y1),
+                    "x2": round(x2),
+                    "y2": round(y2),
+                },
+                "score": face.det_score.item(),
+                "embedding": face.normed_embedding.tolist(),
+            }
+        )
+    return results
+
+
+def _load_facial_recognition(
+    model_name: str,
+    min_face_score: float | None = None,
+    cache_dir: Path | str | None = None,
+    **model_kwargs,
+):
+    if cache_dir is None:
+        cache_dir = _get_cache_dir(model_name, "facial-recognition")
+    if isinstance(cache_dir, Path):
+        cache_dir = cache_dir.as_posix()
+    if min_face_score is None:
+        min_face_score = float(os.getenv("MACHINE_LEARNING_MIN_FACE_SCORE", 0.7))
+
+    model = FaceAnalysis(
+        name=model_name,
+        root=cache_dir,
+        allowed_modules=["detection", "recognition"],
+        **model_kwargs,
+    )
+    model.prepare(ctx_id=0, det_thresh=min_face_score, det_size=(640, 640))
+    return model
+
+
+def _get_cache_dir(model_name: str, model_type: str) -> Path:
+    return Path(cache_folder, device, model_type, model_name)