import os
import gc

import cv2
import torch
import numpy as np
import gradio as gr
from PIL import Image, ImageDraw
from segment_anything import sam_model_registry, SamAutomaticMaskGenerator, SamPredictor
from transformers import OwlViTProcessor, OwlViTForObjectDetection

models = {
    'vit_b': './checkpoints/sam_vit_b_01ec64.pth',
    'vit_l': './checkpoints/sam_vit_l_0b3195.pth',
    'vit_h': './checkpoints/sam_vit_h_4b8939.pth'
}
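# Note: the SAM checkpoints above are assumed to be downloaded into ./checkpoints/
# beforehand. They are published with the official segment-anything repository,
# e.g. https://dl.fbaipublicfiles.com/segment_anything/sam_vit_b_01ec64.pth and
# the corresponding vit_l / vit_h files.
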
image_examples = [
    [os.path.join(os.path.dirname(__file__), "./images/53960-scaled.jpg"), 0, []],
    [os.path.join(os.path.dirname(__file__), "./images/2388455-scaled.jpg"), 1, []],
    [os.path.join(os.path.dirname(__file__), "./images/1.jpg"), 2, []],
    [os.path.join(os.path.dirname(__file__), "./images/2.jpg"), 3, []],
    [os.path.join(os.path.dirname(__file__), "./images/3.jpg"), 4, []],
    [os.path.join(os.path.dirname(__file__), "./images/4.jpg"), 5, []],
    [os.path.join(os.path.dirname(__file__), "./images/5.jpg"), 6, []],
    [os.path.join(os.path.dirname(__file__), "./images/6.jpg"), 7, []],
    [os.path.join(os.path.dirname(__file__), "./images/7.jpg"), 8, []],
    [os.path.join(os.path.dirname(__file__), "./images/8.jpg"), 9, []]
]

def plot_boxes(img, boxes):
    # img is a float image in [0, 1]; convert to uint8 RGB before drawing
    img_pil = Image.fromarray(np.uint8(img * 255)).convert('RGB')
    draw = ImageDraw.Draw(img_pil)
    for box in boxes:
        color = tuple(np.random.randint(0, 255, size=3).tolist())
        x0, y0, x1, y1 = [int(v) for v in box]
        draw.rectangle([x0, y0, x1, y1], outline=color, width=6)
    return img_pil

def segment_one(img, mask_generator, seed=None):
    if seed is not None:
        np.random.seed(seed)
    masks = mask_generator.generate(img)
    sorted_anns = sorted(masks, key=(lambda x: x['area']), reverse=True)
    mask_all = np.ones((img.shape[0], img.shape[1], 3))
    for ann in sorted_anns:
        m = ann['segmentation']
        color_mask = np.random.random((1, 3)).tolist()[0]
        for i in range(3):
            mask_all[m, i] = color_mask[i]
    result = img / 255 * 0.3 + mask_all * 0.7
    return result, mask_all

def generator_inference(device, model_type, points_per_side, pred_iou_thresh, stability_score_thresh,
                        min_mask_region_area, stability_score_offset, box_nms_thresh, crop_n_layers, crop_nms_thresh,
                        input_x, progress=gr.Progress()):
    # sam model
    sam = sam_model_registry[model_type](checkpoint=models[model_type]).to(device)
    mask_generator = SamAutomaticMaskGenerator(
        sam,
        points_per_side=points_per_side,
        pred_iou_thresh=pred_iou_thresh,
        stability_score_thresh=stability_score_thresh,
        stability_score_offset=stability_score_offset,
        box_nms_thresh=box_nms_thresh,
        crop_n_layers=crop_n_layers,
        crop_nms_thresh=crop_nms_thresh,
        crop_overlap_ratio=512 / 1500,
        crop_n_points_downscale_factor=1,
        point_grids=None,
        min_mask_region_area=min_mask_region_area,
        output_mode='binary_mask'
    )
    # input is an image (numpy array)
    if isinstance(input_x, np.ndarray):
        result, mask_all = segment_one(input_x, mask_generator)
        return result, mask_all
    elif isinstance(input_x, str):  # input is a video, given as a file path
        cap = cv2.VideoCapture(input_x)  # read video
        frames_num = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
        W, H = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH)), int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
        fps = int(cap.get(cv2.CAP_PROP_FPS))
        out = cv2.VideoWriter("output.mp4", cv2.VideoWriter_fourcc('x', '2', '6', '4'), fps, (W, H), isColor=True)
        for _ in progress.tqdm(range(frames_num),
                               desc='Processing video ({} frames, size {}x{})'.format(frames_num, W, H)):
            ret, frame = cap.read()  # read a frame
            if not ret:  # the reported frame count can overshoot; stop at the real end
                break
            result, mask_all = segment_one(frame, mask_generator, seed=2023)
            result = (result * 255).astype(np.uint8)
            out.write(result)
        out.release()
        cap.release()
        return 'output.mp4'

def predictor_inference(device, model_type, input_x, input_text, selected_points, owl_vit_threshold=0.1):
    # sam model
    sam = sam_model_registry[model_type](checkpoint=models[model_type]).to(device)
    predictor = SamPredictor(sam)
    predictor.set_image(input_x)  # process the image to produce an image embedding

    if input_text != '':
        # split the comma-separated prompt into a list of text queries
        text_queries = [input_text.split(',')]
        print(text_queries)
        # OWL-ViT model for open-vocabulary box detection
        processor = OwlViTProcessor.from_pretrained('./checkpoints/models--google--owlvit-base-patch32')
        owlvit_model = OwlViTForObjectDetection.from_pretrained(
            "./checkpoints/models--google--owlvit-base-patch32").to(device)
        # get outputs
        owlvit_inputs = processor(text=text_queries, images=input_x, return_tensors="pt").to(device)
        outputs = owlvit_model(**owlvit_inputs)
        target_size = torch.Tensor([input_x.shape[:2]]).to(device)
        results = processor.post_process_object_detection(outputs=outputs, target_sizes=target_size,
                                                          threshold=owl_vit_threshold)
        # keep the detections for the first (and only) image
        i = 0
        boxes_tensor = results[i]["boxes"]
        boxes = boxes_tensor.cpu().detach().numpy()
        # map the detected boxes into SAM's input coordinate frame
        transformed_boxes = predictor.transform.apply_boxes_torch(torch.Tensor(boxes).to(device),
                                                                  input_x.shape[:2])
        print(transformed_boxes.size(), boxes.shape)
    else:
        transformed_boxes = None
    # point prompts selected in the UI
    if len(selected_points) != 0:
        points = torch.Tensor([p for p, _ in selected_points]).to(device).unsqueeze(1)
        labels = torch.Tensor([int(l) for _, l in selected_points]).to(device).unsqueeze(1)
        transformed_points = predictor.transform.apply_coords_torch(points, input_x.shape[:2])
        print(points.size(), transformed_points.size(), labels.size(), input_x.shape, points)
    else:
        transformed_points, labels = None, None
    # predict segmentation masks from the box and/or point prompts
    masks, scores, logits = predictor.predict_torch(
        point_coords=transformed_points,
        point_labels=labels,
        boxes=transformed_boxes,
        multimask_output=False,
    )
    masks = masks.cpu().detach().numpy()
    mask_all = np.ones((input_x.shape[0], input_x.shape[1], 3))
    for ann in masks:
        color_mask = np.random.random((1, 3)).tolist()[0]
        for i in range(3):
            mask_all[ann[0], i] = color_mask[i]
    img = input_x / 255 * 0.3 + mask_all * 0.7
    if input_text != '':
        img = plot_boxes(img, boxes_tensor)  # image + mask + boxes
    # free the memory used by OWL-ViT
    if input_text != '':
        owlvit_model.cpu()
        del owlvit_model
        del owlvit_inputs
    gc.collect()
    torch.cuda.empty_cache()
    return img, mask_all
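
# A hypothetical direct call (outside the Gradio UI), assuming the checkpoints
# above exist and a CUDA device is available; the "dog" prompt is only an example:
#   img = cv2.cvtColor(cv2.imread('./images/1.jpg'), cv2.COLOR_BGR2RGB)
#   overlay, masks = predictor_inference('cuda', 'vit_b', img, 'dog', [], owl_vit_threshold=0.1)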

def run_inference(device, model_type, points_per_side, pred_iou_thresh, stability_score_thresh, min_mask_region_area,
                  stability_score_offset, box_nms_thresh, crop_n_layers, crop_nms_thresh, owl_vit_threshold, input_x,
                  input_text, selected_points):
    # if input_x is an int, the image was selected from the examples list
    if isinstance(input_x, int):
        input_x = cv2.imread(image_examples[input_x][0])
        input_x = cv2.cvtColor(input_x, cv2.COLOR_BGR2RGB)
    if (input_text != '' and not isinstance(input_x, str)) or len(selected_points) != 0:  # user gave text or points
        print('use predictor_inference')
        print('prompt text: ', input_text)
        print('prompt points length: ', len(selected_points))
        return predictor_inference(device, model_type, input_x, input_text, selected_points, owl_vit_threshold)
    else:
        print('use generator_inference')
        return generator_inference(device, model_type, points_per_side, pred_iou_thresh, stability_score_thresh,
                                   min_mask_region_area, stability_score_offset, box_nms_thresh, crop_n_layers,
                                   crop_nms_thresh, input_x)
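
# A minimal usage sketch (not part of the original Space, which wires run_inference
# into a Gradio UI): run the automatic mask generator on the first example image.
# The parameter values below are SAM's usual defaults, assumed here rather than
# taken from the original app.
#   overlay, masks = run_inference(
#       device='cuda' if torch.cuda.is_available() else 'cpu', model_type='vit_b',
#       points_per_side=32, pred_iou_thresh=0.88, stability_score_thresh=0.95,
#       min_mask_region_area=0, stability_score_offset=1.0, box_nms_thresh=0.7,
#       crop_n_layers=0, crop_nms_thresh=0.7, owl_vit_threshold=0.1,
#       input_x=0, input_text='', selected_points=[])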