Commit 1d1c937: Duplicate from curt-park/segment-anything-with-clip
Co-authored-by: Jinwoo Park <[email protected]>
- .gitattributes +34 -0
- .gitignore +2 -0
- Makefile +8 -0
- README.md +14 -0
- ViT-B-32.pt +3 -0
- app.py +193 -0
- examples/city.jpg +0 -0
- examples/dog.jpg +0 -0
- examples/food.jpg +0 -0
- examples/horse.jpg +0 -0
- requirements.txt +6 -0
- sam_vit_h_4b8939.pth +3 -0
.gitattributes
ADDED
@@ -0,0 +1,34 @@
+*.7z filter=lfs diff=lfs merge=lfs -text
+*.arrow filter=lfs diff=lfs merge=lfs -text
+*.bin filter=lfs diff=lfs merge=lfs -text
+*.bz2 filter=lfs diff=lfs merge=lfs -text
+*.ckpt filter=lfs diff=lfs merge=lfs -text
+*.ftz filter=lfs diff=lfs merge=lfs -text
+*.gz filter=lfs diff=lfs merge=lfs -text
+*.h5 filter=lfs diff=lfs merge=lfs -text
+*.joblib filter=lfs diff=lfs merge=lfs -text
+*.lfs.* filter=lfs diff=lfs merge=lfs -text
+*.mlmodel filter=lfs diff=lfs merge=lfs -text
+*.model filter=lfs diff=lfs merge=lfs -text
+*.msgpack filter=lfs diff=lfs merge=lfs -text
+*.npy filter=lfs diff=lfs merge=lfs -text
+*.npz filter=lfs diff=lfs merge=lfs -text
+*.onnx filter=lfs diff=lfs merge=lfs -text
+*.ot filter=lfs diff=lfs merge=lfs -text
+*.parquet filter=lfs diff=lfs merge=lfs -text
+*.pb filter=lfs diff=lfs merge=lfs -text
+*.pickle filter=lfs diff=lfs merge=lfs -text
+*.pkl filter=lfs diff=lfs merge=lfs -text
+*.pt filter=lfs diff=lfs merge=lfs -text
+*.pth filter=lfs diff=lfs merge=lfs -text
+*.rar filter=lfs diff=lfs merge=lfs -text
+*.safetensors filter=lfs diff=lfs merge=lfs -text
+saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+*.tar.* filter=lfs diff=lfs merge=lfs -text
+*.tflite filter=lfs diff=lfs merge=lfs -text
+*.tgz filter=lfs diff=lfs merge=lfs -text
+*.wasm filter=lfs diff=lfs merge=lfs -text
+*.xz filter=lfs diff=lfs merge=lfs -text
+*.zip filter=lfs diff=lfs merge=lfs -text
+*.zst filter=lfs diff=lfs merge=lfs -text
+*tfevents* filter=lfs diff=lfs merge=lfs -text
.gitignore
ADDED
@@ -0,0 +1,2 @@
+__pycache__
+flagged
Makefile
ADDED
@@ -0,0 +1,8 @@
+env:
+	conda create -n segment-anything python=3.9
+
+setup:
+	pip install -r requirements.txt
+
+run:
+	gradio app.py
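
The targets encode the intended workflow: make env creates the conda environment once, then, inside it, make setup installs the pinned requirements and make run launches the app through the Gradio CLI.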
README.md
ADDED
@@ -0,0 +1,14 @@
+---
+title: Segment Anything
+emoji: 🐠
+colorFrom: green
+colorTo: indigo
+sdk: gradio
+sdk_version: 3.24.1
+app_file: app.py
+pinned: false
+license: apache-2.0
+duplicated_from: curt-park/segment-anything-with-clip
+---
+
+Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
ViT-B-32.pt
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:40d365715913c9da98579312b702a82c18be219cc2a73407c4526f58eba950af
+size 353976522
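
This entry is a Git LFS pointer rather than the weights themselves: the oid and size fields (roughly 354 MB here, the CLIP ViT-B/32 checkpoint that load_clip in app.py reads) tell LFS what to fetch. The SAM checkpoint sam_vit_h_4b8939.pth at the end of the diff is stored the same way.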
app.py
ADDED
@@ -0,0 +1,193 @@
+import os
+from functools import lru_cache
+from random import randint
+from typing import Any, Callable, Dict, List, Tuple
+
+import clip
+import cv2
+import gradio as gr
+import numpy as np
+import PIL
+import torch
+from segment_anything import SamAutomaticMaskGenerator, sam_model_registry
+
+CHECKPOINT_PATH = "sam_vit_h_4b8939.pth"
+MODEL_TYPE = "default"
+MAX_WIDTH = MAX_HEIGHT = 800
+CLIP_WIDTH = CLIP_HEIGHT = 300
+THRESHOLD = 0.05
+device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+
+
+@lru_cache
+def load_mask_generator() -> SamAutomaticMaskGenerator:
+    sam = sam_model_registry[MODEL_TYPE](checkpoint=CHECKPOINT_PATH).to(device)
+    mask_generator = SamAutomaticMaskGenerator(sam)
+    return mask_generator
+
+
+@lru_cache
+def load_clip(
+    name: str = "ViT-B-32.pt",
+) -> Tuple[torch.nn.Module, Callable[[PIL.Image.Image], torch.Tensor]]:
+    model_path = os.path.join(".", name)
+    model, preprocess = clip.load(model_path, device=device)
+    return model.to(device), preprocess
+
+
+def adjust_image_size(image: np.ndarray) -> np.ndarray:
+    height, width = image.shape[:2]
+    if height > width:
+        if height > MAX_HEIGHT:
+            height, width = MAX_HEIGHT, int(MAX_HEIGHT / height * width)
+    else:
+        if width > MAX_WIDTH:
+            height, width = int(MAX_WIDTH / width * height), MAX_WIDTH
+    image = cv2.resize(image, (width, height))
+    return image
+
+
+@torch.no_grad()
+def get_scores(crops: List[PIL.Image.Image], query: str) -> torch.Tensor:
+    model, preprocess = load_clip()
+    preprocessed = [preprocess(crop) for crop in crops]
+    preprocessed = torch.stack(preprocessed).to(device)
+    token = clip.tokenize(query).to(device)
+    img_features = model.encode_image(preprocessed)
+    txt_features = model.encode_text(token)
+    img_features /= img_features.norm(dim=-1, keepdim=True)
+    txt_features /= txt_features.norm(dim=-1, keepdim=True)
+    similarity = (100.0 * img_features @ txt_features.T).softmax(dim=0)
+    return similarity
+
+
+def filter_masks(
+    image: np.ndarray,
+    masks: List[Dict[str, Any]],
+    predicted_iou_threshold: float,
+    stability_score_threshold: float,
+    query: str,
+    clip_threshold: float,
+) -> List[Dict[str, Any]]:
+    cropped_masks: List[PIL.Image.Image] = []
+    filtered_masks: List[Dict[str, Any]] = []
+
+    for mask in masks:
+        if (
+            mask["predicted_iou"] < predicted_iou_threshold
+            or mask["stability_score"] < stability_score_threshold
+        ):
+            continue
+
+        filtered_masks.append(mask)
+
+        x, y, w, h = mask["bbox"]
+        crop = image[y: y + h, x: x + w]
+        crop = cv2.cvtColor(crop, cv2.COLOR_BGR2RGB)
+        crop = PIL.Image.fromarray(crop).convert("RGB")  # already uint8 in [0, 255]; multiplying by 255 would overflow
+        crop = crop.resize((CLIP_WIDTH, CLIP_HEIGHT))  # PIL resize returns a new image
+        cropped_masks.append(crop)
+
+    if query and filtered_masks:
+        scores = get_scores(cropped_masks, query)
+        filtered_masks = [
+            filtered_masks[i]
+            for i, score in enumerate(scores)
+            if score > clip_threshold
+        ]
+
+    return filtered_masks
+
+
+def draw_masks(
+    image: np.ndarray, masks: List[Dict[str, Any]], alpha: float = 0.7
+) -> np.ndarray:
+    for mask in masks:
+        color = [randint(127, 255) for _ in range(3)]
+
+        # draw mask overlay
+        colored_mask = np.expand_dims(mask["segmentation"], 0).repeat(3, axis=0)
+        colored_mask = np.moveaxis(colored_mask, 0, -1)
+        masked = np.ma.MaskedArray(image, mask=colored_mask, fill_value=color)
+        image_overlay = masked.filled()
+        image = cv2.addWeighted(image, 1 - alpha, image_overlay, alpha, 0)
+
+        # draw contour
+        contours, _ = cv2.findContours(
+            np.uint8(mask["segmentation"]), cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE
+        )
+        cv2.drawContours(image, contours, -1, (255, 0, 0), 2)
+    return image
+
+
+def segment(
+    predicted_iou_threshold: float,
+    stability_score_threshold: float,
+    clip_threshold: float,
+    image_path: str,
+    query: str,
+) -> PIL.Image.Image:
+    mask_generator = load_mask_generator()
+    # reduce the size to save gpu memory
+    image = adjust_image_size(cv2.imread(image_path))
+    masks = mask_generator.generate(image)
+    masks = filter_masks(
+        image,
+        masks,
+        predicted_iou_threshold,
+        stability_score_threshold,
+        query,
+        clip_threshold,
+    )
+    image = draw_masks(image, masks)
+    image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
+    image = PIL.Image.fromarray(np.uint8(image)).convert("RGB")
+    return image
+
+
+demo = gr.Interface(
+    fn=segment,
+    inputs=[
+        gr.Slider(0, 1, value=0.9, label="predicted_iou_threshold"),
+        gr.Slider(0, 1, value=0.8, label="stability_score_threshold"),
+        gr.Slider(0, 1, value=0.05, label="clip_threshold"),
+        gr.Image(type="filepath"),
+        "text",
+    ],
+    outputs="image",
+    allow_flagging="never",
+    title="Segment Anything with CLIP",
+    examples=[
+        [
+            0.9,
+            0.8,
+            0.15,
+            os.path.join(os.path.dirname(__file__), "examples/dog.jpg"),
+            "A dog only",
+        ],
+        [
+            0.9,
+            0.8,
+            0.1,
+            os.path.join(os.path.dirname(__file__), "examples/city.jpg"),
+            "A bridge on the water",
+        ],
+        [
+            0.9,
+            0.8,
+            0.05,
+            os.path.join(os.path.dirname(__file__), "examples/food.jpg"),
+            "",
+        ],
+        [
+            0.9,
+            0.8,
+            0.05,
+            os.path.join(os.path.dirname(__file__), "examples/horse.jpg"),
+            "horse",
+        ],
+    ],
+)
+
+if __name__ == "__main__":
+    demo.launch()
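
Because segment() is a plain function, the Space can be smoke-tested without the UI. A minimal sketch, assuming both LFS checkpoints above have been pulled into the working directory:

# Hypothetical smoke test: reuse the first Gradio example's arguments.
from app import segment

result = segment(
    predicted_iou_threshold=0.9,
    stability_score_threshold=0.8,
    clip_threshold=0.15,
    image_path="examples/dog.jpg",
    query="A dog only",
)
result.save("dog_segmented.png")  # segment() returns a PIL RGB image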
examples/city.jpg
ADDED
examples/dog.jpg
ADDED
examples/food.jpg
ADDED
examples/horse.jpg
ADDED
requirements.txt
ADDED
@@ -0,0 +1,6 @@
+gradio==3.24.1
+opencv-python==4.7.0.72
+pycocotools==2.0.6
+matplotlib==3.7.1
+git+https://github.com/facebookresearch/segment-anything.git
+git+https://github.com/openai/CLIP.git
sam_vit_h_4b8939.pth
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a7bf3b02f3ebf1267aba913ff637d9a2d5c33d3173bb679e46d9f338c26f262e
+size 2564550879