import os
import sys

sys.path.insert(0, "/home/thebears/source/models/yolov7")

import time

import cv2
import matplotlib.pyplot as plt
import numpy as np
import open_clip
import torch
import torch.nn.functional as F
import yaml
from pymediainfo import MediaInfo
from torchvision import transforms

from models.experimental import attempt_load
from utils.general import check_img_size, non_max_suppression

device = torch.device("cuda")

pretrained_name = "webli"
model_name = "ViT-SO400M-16-SigLIP2-512"
# model_name = 'ViT-SO400M-14-SigLIP-384'  # alternative checkpoint

clip_model, _, clip_preprocess_og = open_clip.create_model_and_transforms(
    model_name, pretrained=pretrained_name
)
tokenizer = open_clip.get_tokenizer('hf-hub:timm/' + model_name)
labels_list = [
    "A bird with a brown head and black body",
    "A bird with a black head and black body",
]
text = tokenizer(labels_list, context_length=clip_model.context_length)
with torch.no_grad():
    text_features = clip_model.encode_text(text).detach().cpu()
    text_features = F.normalize(text_features, dim=-1).detach().cpu()

# %%
# Move the CLIP model to the GPU in fp16 and run one dummy image through it as a warmup.
clip_model = clip_model.half().to(device)
clip_dtype = next(clip_model.parameters()).dtype
clip_img_size = clip_preprocess_og.transforms[0].size
_ = clip_model.encode_image(
    torch.rand(1, 3, *clip_img_size, dtype=clip_dtype, device=device)
)
# Keep only Resize and Normalize from the stock preprocess; frames arrive here
# as tensors already, so the CenterCrop/ToTensor steps are unnecessary.
clip_preprocess = transforms.Compose([clip_preprocess_og.transforms[x] for x in [0, 3]])

# Load the YOLOv7 detector in fp16 and warm it up with a zero tensor.
det_root_path = "/home/thebears/source/model_weights"
det_model_weights_root = os.path.join(det_root_path, "yolov7")
det_model_weights_path = os.path.join(det_model_weights_root, "best.pt")
det_data_yaml_path = os.path.join(det_model_weights_root, "inaturalist.yaml")
det_model = attempt_load(det_model_weights_path, map_location=device)
det_model = det_model.half().to(device)
det_dtype = next(det_model.parameters()).dtype
det_imgsz = 1280
det_stride = int(det_model.stride.max())
det_imgsz = check_img_size(det_imgsz, s=det_stride)
_ = det_model(torch.zeros(1, 3, det_imgsz, det_imgsz, dtype=det_dtype).to(device))
with open(det_data_yaml_path, "r") as ff:
    det_model_info = yaml.safe_load(ff)
det_labels = det_model_info["names"]

# %%
# Scratch cell: rebuild a batch tensor from every decoded frame. Depends on
# clip_array, which is populated by the decode loop further down.
array_score = clip_array
frame_numbers = [x[0] for x in array_score]
frame_values = [x[1] for x in array_score]
frame_as_tensor = (
    torch.from_numpy(np.stack(frame_values)[:, :, :, 0:3])
    .to(torch.float16)
    .to(device)
    .permute([0, 3, 1, 2])
)


def score_frames_det(array_score):
    """Run only the YOLOv7 detector on a batch of (frame_number, frame) tuples."""
    frame_numbers = [x[0] for x in array_score]
    frame_values = [x[1] for x in array_score]
    frame_as_tensor = (
        torch.from_numpy(np.stack(frame_values)[:, :, :, 0:3])
        .to(torch.float16)
        .to(device)
        .permute([0, 3, 1, 2])
    )
    with torch.no_grad():
        # Letterbox, scale to [0, 1], and swap BGR -> RGB for the detector.
        frame_for_model = det_vid_preprocess(frame_as_tensor).div(255)[:, [2, 1, 0], :, :]
        det_preds = det_model(frame_for_model)[0]
        det_pred_post_nms = non_max_suppression(det_preds, 0.25, 0.5)
        det_cpu_pred = [x.detach().cpu().numpy() for x in det_pred_post_nms]
    return {"det": det_cpu_pred, "fr#": frame_numbers}


def score_frames_clip(array_score):
    """Run only the CLIP image encoder on a batch of (frame_number, frame) tuples."""
    frame_numbers = [x[0] for x in array_score]
    frame_values = [x[1] for x in array_score]
    frame_as_tensor = (
        torch.from_numpy(np.stack(frame_values)[:, :, :, 0:3])
        .to(torch.float16)
        .to(device)
        .permute([0, 3, 1, 2])
    )
    with torch.no_grad():
        # Scale to [0, 1] and swap BGR -> RGB before the CLIP preprocess,
        # matching what the combined score_frames below does.
        frame_for_clip = clip_preprocess(frame_as_tensor.div(255)[:, [2, 1, 0], :, :])
        clip_pred = clip_model.encode_image(frame_for_clip).detach().cpu().numpy()
    return {"clip": clip_pred, "fr#": frame_numbers}
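
# %%
# Minimal sketch, not part of the original pipeline: combine the precomputed
# text_features with score_frames_clip output to rank the two label prompts per
# frame. The helper name rank_labels is hypothetical; it assumes the same
# (frame_number, HxWx3 BGR array) batch format used everywhere else in this script.
def rank_labels(array_score):
    clip_out = score_frames_clip(array_score)
    # Cast fp16 embeddings to fp32 so the matmul dtype matches text_features.
    img_feats = F.normalize(torch.from_numpy(clip_out["clip"]).float(), dim=-1)
    sims = img_feats @ text_features.T  # cosine similarity, frames x labels
    best = sims.argmax(dim=-1)
    return [(fr, labels_list[idx]) for fr, idx in zip(clip_out["fr#"], best.tolist())]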
def score_frames(array_score):
    """Run both the detector and the CLIP encoder on one batch of frames.

    This is the combined function the decode loop below calls.
    """
    frame_numbers = [x[0] for x in array_score]
    frame_values = [x[1] for x in array_score]
    frame_as_tensor = (
        torch.from_numpy(np.stack(frame_values)[:, :, :, 0:3])
        .to(torch.float16)
        .to(device)
        .permute([0, 3, 1, 2])
    )
    with torch.no_grad():
        # Detector branch: letterbox, scale to [0, 1], BGR -> RGB.
        frame_for_model = det_vid_preprocess(frame_as_tensor).div(255)[:, [2, 1, 0], :, :]
        det_preds = det_model(frame_for_model)[0]
        det_pred_post_nms = non_max_suppression(det_preds, 0.25, 0.5)
        det_cpu_pred = [x.detach().cpu().numpy() for x in det_pred_post_nms]
        # CLIP branch: scale to [0, 1], BGR -> RGB, then resize + normalize.
        frame_for_clip = frame_as_tensor.div(255)
        frame_for_clip = clip_preprocess(frame_for_clip[:, (2, 1, 0), :, :])
        clip_pred = clip_model.encode_image(frame_for_clip).detach().cpu().numpy()
    return {"det": det_cpu_pred, "clip": clip_pred, "fr#": frame_numbers}


# %%
# Scratch cell: compare fresh CLIP embeddings against the ones saved in short.npz.
# Depends on clip_array (decode loop below) and saved_emb (loading cell below).
score_result = score_frames(clip_array)
clip_orin = F.normalize(torch.from_numpy(score_result['clip']))
clip_tree = F.normalize(torch.from_numpy(saved_emb))
print(clip_tree.float() @ clip_orin.float().T)

# %%
# Visual check: plot the three channels of the saved mean vector next to the
# three channels of a freshly preprocessed frame.
frame_for_clip = clip_preprocess(frame_as_tensor.div(255)[:, (2, 1, 0), :, :])
mvo = mean_vec_out[0]
ooo = frame_for_clip[0].float().cpu().numpy()
plt.close('all')
fig = plt.figure()
for ch in range(3):
    fig.add_subplot(3, 2, 2 * ch + 1).imshow(mvo[ch])  # saved mean vector, channel ch
    fig.add_subplot(3, 2, 2 * ch + 2).imshow(ooo[ch])  # fresh frame, channel ch
fig.show()

# %%
# Notebook-style inspection of the loaded reference arrays.
raw_vec_out
mean_vec_out

# %%
# Paths to the video under test and the reference embeddings saved earlier.
file_to_score = "/home/thebears/source/ml_code/short.mp4"
vec_file = '/home/thebears/source/ml_code/short.npz'
out = np.load(vec_file)
mean_vec_path = '/home/thebears/source/ml_code/as_np_mean.npy'
mean_vec_out = np.load(mean_vec_path)
raw_vec_path = '/home/thebears/source/ml_code/as_np_raw.npy'
raw_vec_out = np.load(raw_vec_path)
saved_fr = list(out['frame_numbers'])
saved_emb = out['embeds']


def get_video_info(file_path):
    """Return the first video track of the file, with frame_count coerced to int."""
    file_info = MediaInfo.parse(file_path)
    video_info = None
    if len(file_info.video_tracks) > 0:
        video_info = file_info.video_tracks[0]
        video_info.frame_count = int(video_info.frame_count)
    return video_info


video_info = get_video_info(file_to_score)

# Match the GStreamer parser to the codec, decode on the NVIDIA hardware decoder,
# and scale to a 1280x1280 canvas before handing frames to appsink.
vid_decoder = "h264parse"
if video_info.format.lower() == "hevc":
    vid_decoder = "h265parse"
gst_cmd = (
    "filesrc location={file_to_score} ! qtdemux name=demux demux.video_0 ! queue ! "
    "{vid_decoder} ! nvv4l2decoder ! nvvidconv ! videoscale method=1 add-borders=false ! "
    "video/x-raw,width=1280,height=1280 ! appsink sync=false"
).format(file_to_score=file_to_score, vid_decoder=vid_decoder)
# Alternative pipeline without the fixed-size caps filter:
# gst_cmd = (
#     "filesrc location={file_to_score} ! qtdemux name=demux demux.video_0 ! queue ! "
#     "{vid_decoder} ! nvv4l2decoder ! nvvidconv ! videoscale method=1 add-borders=false ! "
#     "appsink sync=false"
# ).format(file_to_score=file_to_score, vid_decoder=vid_decoder)
cap_handle = cv2.VideoCapture(gst_cmd, cv2.CAP_GSTREAMER)
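
# Sanity-check sketch, an addition under stated assumptions: nvv4l2decoder only
# exists on NVIDIA Jetson-class devices, so when the GStreamer open fails, fall
# back to OpenCV's default backend. Fallback frames are full-resolution rather
# than 1280x1280, which det_vid_preprocess below absorbs via its Resize step.
if not cap_handle.isOpened():
    print("GStreamer pipeline failed to open; falling back to the default backend")
    cap_handle = cv2.VideoCapture(file_to_score)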
# Fit the video inside a det_imgsz x det_imgsz square while preserving aspect ratio.
target_max = det_imgsz
vid_h = video_info.height
vid_w = video_info.width
if vid_h > vid_w:
    target_h = target_max
    target_w = target_max * vid_w / vid_h
elif vid_h == vid_w:
    target_h = target_max
    target_w = target_max
else:
    target_h = target_max * vid_h / vid_w
    target_w = target_max
target_h = int(target_h)
target_w = int(target_w)

# Pad each dimension up to the next multiple of the detector stride, splitting
# the padding as evenly as possible between the two sides.
pad_amt = [0, 0, 0, 0]  # left, top, right, bottom (torchvision Pad order)
if target_w % det_stride != 0:
    pad_diff = det_stride - target_w % det_stride
    pad_amt[0] = round(pad_diff / 2)
    pad_amt[2] = pad_diff - pad_amt[0]
if target_h % det_stride != 0:
    pad_diff = det_stride - target_h % det_stride
    pad_amt[1] = round(pad_diff / 2)
    pad_amt[3] = pad_diff - pad_amt[1]

det_vid_preprocess = transforms.Compose(
    [transforms.Resize((target_h, target_w)), transforms.Pad(pad_amt, fill=127)]
)

# %%
# Decode the video and score it in batches.
batch_size = 6
clip_interval = 10
array_score = list()
final_output = dict()
final_output["start_score_time"] = time.time()
final_output["num_frames"] = video_info.frame_count
frame_numbers = list()
det_results = list()
clip_results = list()
clip_array = list()  # keeps every decoded frame; used by the scratch cells above
for i in range(video_info.frame_count):
    success, frame_matrix = cap_handle.read()
    if not success:
        break
    clip_array.append((i, frame_matrix))
    array_score.append((i, frame_matrix))
    if len(array_score) >= batch_size:
        score_result = score_frames(array_score)
        det_results.extend(score_result["det"])
        clip_results.extend(score_result["clip"])
        frame_numbers.extend(score_result["fr#"])
        array_score = list()
    if not (i % clip_interval):
        print('do_clip')  # placeholder for CLIP-only scoring every clip_interval frames

# Flush the final partial batch.
if len(array_score) > 0:
    score_result = score_frames(array_score)
    det_results.extend(score_result["det"])
    clip_results.extend(score_result["clip"])
    frame_numbers.extend(score_result["fr#"])
cap_handle.release()

final_output["end_score_time"] = time.time()
final_output["video"] = {
    "w": vid_w,
    "h": vid_h,
    "path": file_to_score,
    "target_w": target_w,
    "target_h": target_h,
    "pad_amt": pad_amt,
}
try:
    final_output['scoring_fps'] = final_output['num_frames'] / (
        final_output['end_score_time'] - final_output['start_score_time']
    )
except ZeroDivisionError:
    pass

# Flatten the per-frame detections into a JSON-friendly structure.
final_output['scores'] = list()
for frame_number, frame in zip(frame_numbers, det_results):
    cframe_dict = dict()
    cframe_dict['frame'] = frame_number
    cframe_dict['score_number'] = frame_number
    cframe_dict['detections'] = list()
    for det in frame:
        data = dict()
        data['coords'] = [float(x) for x in det[0:4]]  # xyxy in letterboxed pixels
        data['score'] = float(det[4])
        data['idx'] = int(det[5])
        try:
            data['name'] = det_labels[data['idx']]
        except (IndexError, KeyError):
            data['name'] = 'Code failed'
        cframe_dict['detections'].append(data)
    final_output['scores'].append(cframe_dict)
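
# %%
# Minimal sketch: final_output contains only ints, floats, strings, and lists,
# so it can be written straight to disk as JSON. The output path alongside the
# video is an assumption, not something the original script specifies.
import json

with open(file_to_score + ".scores.json", "w") as fh:
    json.dump(final_output, fh, indent=2)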