YACWC

2025-06-30 14:19:58 -04:00
parent 21b7ccb794
commit c8dbef2c0f
10 changed files with 96383 additions and 48 deletions
--- a/ml_run.py
+++ b/ml_run.py
@@ -0,0 +1,334 @@
+import sys
+
+sys.path.insert(0, "/home/thebears/source/models/yolov7")
+import time
+from datetime import datetime
+import cv2
+import numpy as np
+from pymediainfo import MediaInfo
+import inspect
+import open_clip
+import sys
+import torch
+import yaml
+from models.experimental import attempt_load
+from utils.general import check_img_size, non_max_suppression
+from torchvision import transforms
+
+# All models and input tensors below are moved to this device.
+device = torch.device("cuda")
+
+# open_clip pretrained tag and architecture name for the CLIP-style encoder.
+pretrained_name = "webli"
+# NOTE(review): both candidate architectures used to be commented out, which
+# left `model_name` undefined and made the create_model_and_transforms() call
+# below fail with NameError.  The first candidate is enabled here so the
+# script runs; confirm it is the model the saved embeddings were made with.
+model_name = "ViT-SO400M-16-SigLIP2-512"
+#model_name = 'ViT-SO400M-14-SigLIP-384'
+
+clip_model, _, clip_preprocess_og = open_clip.create_model_and_transforms(
+    model_name, pretrained=pretrained_name
+)
+tokenizer = open_clip.get_tokenizer('hf-hub:timm/'+model_name)
+labels_list = ["A bird with a brown head and black body", "A bird with a black head and black body"]
+text = tokenizer(labels_list, context_length=clip_model.context_length)
+
+import torch.nn.functional as F
+
+# Encode the text prompts once and keep unit-length features on the CPU.
+# Under no_grad the outputs carry no autograd history, so no detach is needed.
+with torch.no_grad():
+    text_features = F.normalize(clip_model.encode_text(text), dim=-1).cpu()
+# %%
+
+# Move the CLIP model to the GPU in half precision and record its parameter
+# dtype so that inputs can be created to match.
+clip_model = clip_model.half().to(device)
+clip_dtype = next(clip_model.parameters()).dtype
+# The input size is read from the first transform of the stock pipeline.
+clip_img_size = clip_preprocess_og.transforms[0].size
+# Throw-away forward pass on random data.  It runs under no_grad so the dummy
+# call neither builds nor retains an autograd graph on the GPU.
+with torch.no_grad():
+    _ = clip_model.encode_image(
+        torch.rand(1, 3, *clip_img_size, dtype=clip_dtype, device=device)
+    )
+# Keep only transforms 0 and 3 of the stock pipeline so it can be applied to
+# tensors directly -- presumably the resize and normalise steps; TODO confirm
+# against the transform order of the installed open_clip version.
+clip_preprocess = transforms.Compose([clip_preprocess_og.transforms[x] for x in [0, 3]])
+
+# `os.path.join` is used below, but `os` was never imported by this file.
+import os
+
+# Locations of the detector weights and of the YAML file with class names.
+det_root_path = "/home/thebears/source/model_weights"
+det_model_weights_root = os.path.join(det_root_path, "yolov7")
+det_model_weights_path = os.path.join(det_model_weights_root, "best.pt")
+det_data_yaml_path = os.path.join(det_model_weights_root, "inaturalist.yaml")
+det_model = attempt_load(det_model_weights_path, map_location=device)
+det_model = det_model.half().to(device)
+
+det_dtype = next(det_model.parameters()).dtype
+det_imgsz = 1280
+det_stride = int(det_model.stride.max())
+# Let the detector helper validate/adjust the image size for the model stride.
+det_imgsz = check_img_size(det_imgsz, s=det_stride)
+# Throw-away forward pass on zeros; no_grad keeps it from building an
+# autograd graph.
+with torch.no_grad():
+    _ = det_model(
+        torch.zeros(1, 3, det_imgsz, det_imgsz, dtype=det_dtype, device=device)
+    )
+
+with open(det_data_yaml_path, "r") as ff:
+    det_model_info = yaml.safe_load(ff)
+# Class-index -> name lookup used when the final detections are assembled.
+det_labels = det_model_info["names"]
+
+
+
+
+# Scratch cell: build one GPU tensor from every frame collected in
+# `clip_array`.  NOTE(review): `clip_array` is only filled by the
+# frame-reading loop further down in this file, so this cell has to be run
+# after that loop.
+array_score = clip_array
+frame_numbers = [entry[0] for entry in array_score]
+frame_values = [entry[1] for entry in array_score]
+# Stack the frames and keep the first three channels of the last axis.
+stacked_frames = np.stack(frame_values)[:, :, :, 0:3]
+frame_as_tensor = torch.from_numpy(stacked_frames).to(torch.float16)
+# Move to the GPU, then reorder the axes from (0, 1, 2, 3) to (0, 3, 1, 2).
+frame_as_tensor = frame_as_tensor.to(device).permute([0, 3, 1, 2])
+def score_frames_det(array_score):
+    """Run the detector on a batch of frames.
+
+    Args:
+        array_score: sequence of ``(frame_number, frame)`` entries.  The
+            frames are stacked with ``np.stack`` and only the first three
+            channels of the last axis are used.
+
+    Returns:
+        dict with ``"det"`` (one NumPy array of post-NMS predictions per
+        frame) and ``"fr#"`` (the frame numbers in input order).
+    """
+    numbers = [entry[0] for entry in array_score]
+    stacked = np.stack([entry[1] for entry in array_score])[:, :, :, 0:3]
+    batch = torch.from_numpy(stacked).to(torch.float16).to(device)
+    batch = batch.permute([0, 3, 1, 2])
+
+    with torch.no_grad():
+        # Resize/pad for the detector, scale by 1/255 and reverse the channel
+        # order (presumably BGR -> RGB; confirm against the capture format).
+        model_input = det_vid_preprocess(batch).div(255)[:, [2, 1, 0], :, :]
+        raw_preds = det_model(model_input)[0]
+        # 0.25 and 0.5 are passed positionally -- presumably the confidence
+        # and IoU thresholds of non_max_suppression; TODO confirm.
+        kept = non_max_suppression(raw_preds, 0.25, 0.5)
+        det_cpu_pred = [pred.detach().cpu().numpy() for pred in kept]
+
+    return {"det": det_cpu_pred, "fr#": numbers}
+
+def score_frames_clip(array_score):
+    """Embed a batch of frames with the CLIP image encoder.
+
+    Args:
+        array_score: sequence of ``(frame_number, frame)`` entries.  The
+            frames are stacked with ``np.stack`` and only the first three
+            channels of the last axis are used.
+
+    Returns:
+        dict with ``"clip"`` (NumPy array of image embeddings, one row per
+        frame) and ``"fr#"`` (the frame numbers in input order).
+    """
+    frame_numbers = [x[0] for x in array_score]
+    frame_values = [x[1] for x in array_score]
+    frame_as_tensor = (
+        torch.from_numpy(np.stack(frame_values)[:, :, :, 0:3])
+        .to(torch.float16)
+        .to(device)
+        .permute([0, 3, 1, 2])
+    )
+
+    with torch.no_grad():
+        # Scale to [0, 1] and reverse the channel order before applying the
+        # CLIP transforms.  This matches the module-level CLIP inference
+        # elsewhere in this file and the channel swap on the detector path;
+        # previously the raw 0-255 values went through the CLIP transforms
+        # unscaled and with the channel order untouched.
+        frame_for_clip = clip_preprocess(
+            frame_as_tensor.div(255)[:, [2, 1, 0], :, :]
+        )
+        clip_pred = clip_model.encode_image(frame_for_clip).detach().cpu().numpy()
+
+    return {"clip": clip_pred, "fr#": frame_numbers}
+
+
+
+
+
+# Scratch cell: score the frames held in `frame_as_tensor` with both models.
+# NOTE(review): `det_vid_preprocess` and `saved_emb` are assigned further down
+# in this file, so this cell only works once those sections have been run.
+with torch.no_grad():
+    # Detector branch: resize/pad, scale by 1/255, reverse the channel order.
+    frame_for_model = det_vid_preprocess(frame_as_tensor).div(255)[:, [2, 1, 0], :, :]
+    det_preds = det_model(frame_for_model)[0]
+    det_pred_post_nms = non_max_suppression(det_preds, 0.25, 0.5)
+    det_cpu_pred = [pred.detach().cpu().numpy() for pred in det_pred_post_nms]
+
+    # CLIP branch: scale by 1/255, reverse the channel order, then apply the
+    # reduced CLIP preprocessing.
+    frame_for_clip = clip_preprocess(frame_as_tensor.div(255)[:, (2, 1, 0), :, :])
+    clip_pred = clip_model.encode_image(frame_for_clip).detach().cpu().numpy()
+
+score_result = dict(det=det_cpu_pred, clip=clip_pred)
+score_result["fr#"] = frame_numbers
+
+
+# Normalise the fresh embeddings and the saved ones, then print the matrix of
+# their pairwise dot products.
+clip_orin, clip_tree = (
+    F.normalize(torch.from_numpy(emb)) for emb in (score_result['clip'], saved_emb)
+)
+print(np.dot(clip_tree, clip_orin.T))
+
+
+# Scratch cell: show the three channels of the first saved reference image
+# (left column) next to the first locally preprocessed frame (right column).
+# NOTE(review): `plt` is not imported anywhere in this file (presumably
+# matplotlib.pyplot from the interactive session -- confirm), and
+# `mean_vec_out` is loaded further down in the file.
+mvo = mean_vec_out[0]
+ooo = frame_for_clip[0].cpu().numpy()
+
+plt.close('all')
+fig = plt.figure()
+
+# 3 x 2 grid, numbered row by row: odd slots on the left, even on the right.
+ax1, ax2, ax3, ax4, ax5, ax6 = (fig.add_subplot(3, 2, slot) for slot in range(1, 7))
+for left_ax, right_ax, channel in ((ax1, ax2, 0), (ax3, ax4, 1), (ax5, ax6, 2)):
+    left_ax.imshow(mvo[channel])
+    right_ax.imshow(ooo[channel])
+fig.show()
+
+# %%
+
+
+# Bare-name expressions: they have no effect when the file runs as a script
+# and presumably exist to echo the arrays when this cell is run interactively.
+# NOTE(review): both arrays are only loaded by the np.load calls further down
+# in this file, so a top-to-bottom run raises NameError here -- confirm
+# whether this cell is still needed.
+raw_vec_out
+mean_vec_out
+
+# %%
+# Video to score and the previously saved reference data it is compared with.
+file_to_score = "/home/thebears/source/ml_code/short.mp4"
+vec_file = '/home/thebears/source/ml_code/short.npz'
+# np.load() on an .npz returns an archive object that keeps the file open.
+# Read the needed arrays inside a with-block so the handle is closed again;
+# `out` stays defined afterwards but is closed.
+with np.load(vec_file) as out:
+    saved_fr = list(out['frame_numbers'])
+    saved_emb = out['embeds']
+
+mean_vec_path = '/home/thebears/source/ml_code/as_np_mean.npy'
+mean_vec_out = np.load(mean_vec_path)
+
+raw_vec_path = '/home/thebears/source/ml_code/as_np_raw.npy'
+raw_vec_out = np.load(raw_vec_path)
+
+# Redundant with the import at the top of the file; kept so this cell can be
+# run on its own.
+import numpy as np
+
+
+
+def get_video_info(file_path):
+    """Return the first video track of *file_path* as reported by MediaInfo.
+
+    The track's ``frame_count`` attribute is converted to ``int`` in place
+    before the track is returned.
+
+    Args:
+        file_path: path of the media file to inspect.
+
+    Returns:
+        The first entry of ``MediaInfo.parse(file_path).video_tracks``.
+
+    Raises:
+        ValueError: if the file contains no video track.  Previously this
+            case surfaced as an AttributeError on ``None``.
+    """
+    file_info = MediaInfo.parse(file_path)
+    if not file_info.video_tracks:
+        raise ValueError("no video track found in {!r}".format(file_path))
+
+    video_info = file_info.video_tracks[0]
+    video_info.frame_count = int(video_info.frame_count)
+    return video_info
+
+
+video_info = get_video_info(file_to_score)
+
+# Pick the GStreamer parser element from the reported codec: h265parse for
+# HEVC (compared case-insensitively), h264parse otherwise.
+is_hevc = video_info.format.lower() == "HEVC".lower()
+vid_decoder = "h265parse" if is_hevc else "h264parse"
+
+
+# Decode pipeline handed to OpenCV.  The caps filter makes the appsink receive
+# 1280x1280 frames; an earlier variant of this pipeline had no such filter.
+gst_template = "filesrc location={file_to_score} ! qtdemux name=demux demux.video_0 ! queue ! {vid_decoder} ! nvv4l2decoder ! nvvidconv ! videoscale method=1 add-borders=false ! video/x-raw,width=1280,height=1280 ! appsink sync=false"
+gst_cmd = gst_template.format(file_to_score=file_to_score, vid_decoder=vid_decoder)
+
+cap_handle = cv2.VideoCapture(gst_cmd, cv2.CAP_GSTREAMER)
+
+# Fit the source resolution into a det_imgsz box: the longer side becomes
+# target_max and the shorter side is scaled by the same ratio.
+target_max = det_imgsz
+vid_h = video_info.height
+vid_w = video_info.width
+
+if vid_h == vid_w:
+    target_h = target_w = target_max
+elif vid_h > vid_w:
+    target_h, target_w = target_max, target_max * vid_w / vid_h
+else:
+    target_h, target_w = target_max * vid_h / vid_w, target_max
+
+# The scaled side is a float at this point; truncate both to whole pixels.
+target_h, target_w = int(target_h), int(target_w)
+
+def _centered_padding(length, stride):
+    """Return ``(before, after)`` padding that lifts *length* to the next
+    multiple of *stride*; both are 0 when it already is a multiple."""
+    shortfall = -length % stride
+    before = round(shortfall / 2)
+    return before, shortfall - before
+
+
+# Pad width and height up to a multiple of the detector stride, splitting the
+# extra pixels between the two sides of each axis.
+pad_left, pad_right = _centered_padding(target_w, det_stride)
+pad_up, pad_down = _centered_padding(target_h, det_stride)
+# Same slot order as before: [left, up, right, down].
+pad_amt = [pad_left, pad_up, pad_right, pad_down]
+
+
+# Detector preprocessing: resize to the aspect-preserving target size, then
+# pad with the constant value 127.
+det_vid_preprocess = transforms.Compose(
+    [
+        transforms.Resize((target_h, target_w)),
+        transforms.Pad(pad_amt, fill=127),
+    ]
+)
+
+def score_frames(array_score):
+    """Score one batch with both models and merge the results.
+
+    The loop below calls ``score_frames`` and reads the ``"det"``, ``"clip"``
+    and ``"fr#"`` keys, but the file only defined ``score_frames_det`` and
+    ``score_frames_clip``; this wrapper combines the two.
+
+    Args:
+        array_score: sequence of ``(frame_number, frame)`` entries.
+
+    Returns:
+        dict with ``"det"``, ``"clip"`` and ``"fr#"``.
+    """
+    det_part = score_frames_det(array_score)
+    clip_part = score_frames_clip(array_score)
+    return {
+        "det": det_part["det"],
+        "clip": clip_part["clip"],
+        "fr#": det_part["fr#"],
+    }
+
+
+batch_size = 6
+clip_interval = 10
+array_score = list()
+final_output = dict()
+final_output["start_score_time"] = time.time()
+final_output["num_frames"] = video_info.frame_count
+st = time.time()
+frame_numbers = list()
+det_results = list()
+clip_results = list()
+
+# NOTE(review): clip_array keeps every decoded frame for the scratch analysis
+# elsewhere in this file, so memory use grows with the length of the video.
+clip_array = list()
+
+for i in range(video_info.frame_count):
+    success, frame_matrix = cap_handle.read()
+
+    # Stop before storing anything: a failed read has no usable frame, and it
+    # used to be appended to clip_array ahead of this check.
+    if not success:
+        break
+
+    clip_array.append((i, frame_matrix))
+    array_score.append((i, frame_matrix))
+
+    # Score whenever a full batch has been collected.
+    if len(array_score) >= batch_size:
+        score_result = score_frames(array_score)
+
+        det_results.extend(score_result["det"])
+        clip_results.extend(score_result["clip"])
+        frame_numbers.extend(score_result["fr#"])
+        array_score = list()
+
+    # Placeholder marking every clip_interval-th frame; it only prints.
+    if not (i % clip_interval):
+        print('do_clip')
+
+# Score the final, partially filled batch.
+if len(array_score) > 0:
+    score_result = score_frames(array_score)
+    det_results.extend(score_result["det"])
+    clip_results.extend(score_result["clip"])
+    frame_numbers.extend(score_result["fr#"])
+
+# Release the capture and record timing plus the geometry used for scoring.
+cap_handle.release()
+et = time.time()
+
+final_output["end_score_time"] = time.time()
+final_output["video"] = {
+    "w": vid_w,
+    "h": vid_h,
+    "path": file_to_score,
+    "target_w": target_w,
+    "target_h": target_h,
+    "pad_amt": pad_amt,
+}
+
+# Throughput is based on the frame count reported for the file.  Only a zero
+# elapsed time is tolerated (the key is then left unset); any other error now
+# propagates instead of being swallowed.
+elapsed_score_time = final_output['end_score_time'] - final_output['start_score_time']
+try:
+    final_output['scoring_fps'] = final_output['num_frames'] / elapsed_score_time
+except ZeroDivisionError:
+    pass
+
+# Turn the per-frame detection arrays into plain lists/dicts of Python
+# scalars.  Each detection row is read as: columns 0-3 -> coords,
+# column 4 -> score, column 5 -> class index.
+final_output['scores'] = list()
+
+for frame_number, frame in zip(frame_numbers, det_results):
+    detections = list()
+
+    for det in frame:
+        idx = int(det[5])
+
+        # Only a failed label lookup falls back to the placeholder name; the
+        # previous bare `except:` also swallowed KeyboardInterrupt/SystemExit.
+        try:
+            name = det_labels[idx]
+        except (LookupError, TypeError):
+            name = 'Code failed'
+
+        detections.append(
+            {
+                'coords': [float(x) for x in det[0:4]],
+                'score': float(det[4]),
+                'idx': idx,
+                'name': name,
+            }
+        )
+
+    final_output['scores'].append(
+        {
+            'frame': frame_number,
+            'score_number': frame_number,
+            'detections': detections,
+        }
+    )