Compare commits: c8dbef2c0f...main (2 commits)

| Author | SHA1 | Date |
|---|---|---|
| | faff84dd4c | |
| | 74eff0a1fa | |
.gitignore (vendored, 1 changed line)

@@ -5,4 +5,5 @@
 *.trt
 *.whl
 *.npy
+*.npz
 runs/
__pycache__/model_runner.cpython-310.pyc (BIN, new file)
Binary file not shown.
ml_run.py (338 changed lines)

@@ -1,334 +1,8 @@
-import sys
-
-sys.path.insert(0, "/home/thebears/source/models/yolov7")
-import time
-from datetime import datetime
-import cv2
-import numpy as np
-from pymediainfo import MediaInfo
-import inspect
-import open_clip
-import sys
-import torch
-import yaml
-from models.experimental import attempt_load
-from utils.general import check_img_size, non_max_suppression
-from torchvision import transforms
-
-device = torch.device("cuda")
-
-pretrained_name = "webli"
-#model_name = "ViT-SO400M-16-SigLIP2-512"
-#model_name = 'ViT-SO400M-14-SigLIP-384'
-
-clip_model, _, clip_preprocess_og = open_clip.create_model_and_transforms(
-    model_name, pretrained=pretrained_name
-)
-tokenizer = open_clip.get_tokenizer('hf-hub:timm/'+model_name)
-labels_list = ["A bird with a brown head and black body", "A bird with a black head and black body"]
-text = tokenizer(labels_list, context_length=clip_model.context_length)
-
-import torch.nn.functional as F
-with torch.no_grad():
-    text_features = clip_model.encode_text(text).detach().cpu()
-    text_features = F.normalize(text_features, dim=-1).detach().cpu()
+from model_runner import ModelRunner
+mr = ModelRunner()
 # %%
-clip_model = clip_model.half().to(device)
-clip_dtype = next(clip_model.parameters()).dtype
-clip_img_size = clip_preprocess_og.transforms[0].size
-_ = clip_model.encode_image(
-    torch.rand(1, 3, *clip_img_size, dtype=clip_dtype, device=device)
-)
-clip_preprocess = transforms.Compose([clip_preprocess_og.transforms[x] for x in [0, 3]])
-
-det_root_path = "/home/thebears/source/model_weights"
-det_model_weights_root = os.path.join(det_root_path, "yolov7")
-det_model_weights_path = os.path.join(det_model_weights_root, "best.pt")
-det_data_yaml_path = os.path.join(det_model_weights_root, "inaturalist.yaml")
-det_model = attempt_load(det_model_weights_path, map_location=device)
-det_model = det_model.half().to(device)
-
-det_dtype = next(det_model.parameters()).dtype
-det_imgsz = 1280
-det_stride = int(det_model.stride.max())
-det_imgsz = check_img_size(det_imgsz, s=det_stride)
-_ = det_model(torch.zeros(1, 3, det_imgsz, det_imgsz, dtype=det_dtype).to(device))
-
-with open(det_data_yaml_path, "r") as ff:
-    det_model_info = yaml.safe_load(ff)
-det_labels = det_model_info["names"]
-
-
-
-
-array_score = clip_array
-frame_numbers = [x[0] for x in array_score]
-frame_values = [x[1] for x in array_score]
-frame_as_tensor = (
-    torch.from_numpy(np.stack(frame_values)[:, :, :, 0:3])
-    .to(torch.float16)
-    .to(device)
-    .permute([0, 3, 1, 2])
-)
-def score_frames_det(array_score):
-    frame_numbers = [x[0] for x in array_score]
-    frame_values = [x[1] for x in array_score]
-    frame_as_tensor = (
-        torch.from_numpy(np.stack(frame_values)[:, :, :, 0:3])
-        .to(torch.float16)
-        .to(device)
-        .permute([0, 3, 1, 2])
-    )
-
-    with torch.no_grad():
-        frame_for_model = det_vid_preprocess(frame_as_tensor).div(255)[:,[2,1,0],:,:]
-        det_preds = det_model(frame_for_model)[0]
-        det_pred_post_nms = non_max_suppression(det_preds,0.25, 0.5)
-        det_cpu_pred = [x.detach().cpu().numpy() for x in det_pred_post_nms]
-
-        # frame_for_clip = clip_preprocess(frame_as_tensor[:,[0,1,2],:,:])
-        # clip_pred = clip_model.encode_image(frame_for_clip).detach().cpu().numpy()
-
-    return {"det": det_cpu_pred, "fr#": frame_numbers}
-
-def score_frames_clip(array_score):
-    frame_numbers = [x[0] for x in array_score]
-    frame_values = [x[1] for x in array_score]
-    frame_as_tensor = (
-        torch.from_numpy(np.stack(frame_values)[:, :, :, 0:3])
-        .to(torch.float16)
-        .to(device)
-        .permute([0, 3, 1, 2])
-    )
-
-    with torch.no_grad():
-        # frame_for_model = det_vid_preprocess(frame_as_tensor).div(255)[:,[2,1,0],:,:]
-        # det_preds = det_model(frame_for_model)[0]
-        # det_pred_post_nms = non_max_suppression(det_preds,0.25, 0.5)
-        # det_cpu_pred = [x.detach().cpu().numpy() for x in det_pred_post_nms]
-
-        frame_for_clip = clip_preprocess(frame_as_tensor[:,[0,1,2],:,:])
-        clip_pred = clip_model.encode_image(frame_for_clip).detach().cpu().numpy()
-
-    return {"clip": clip_pred, "fr#": frame_numbers}
-
-
-
-
-
-with torch.no_grad():
-    frame_for_model = det_vid_preprocess(frame_as_tensor).div(255)[:,[2,1,0],:,:]
-    det_preds = det_model(frame_for_model)[0]
-    det_pred_post_nms = non_max_suppression(det_preds,0.25, 0.5)
-    det_cpu_pred = [x.detach().cpu().numpy() for x in det_pred_post_nms]
-    frame_for_clip = frame_as_tensor.div(255)
-
-    frame_for_clip = clip_preprocess(frame_for_clip[:,(2,1,0),:,:])
-    clip_pred = clip_model.encode_image(frame_for_clip).detach().cpu().numpy()
-
-score_result = {"det": det_cpu_pred, "clip": clip_pred, "fr#": frame_numbers}
-
-
-clip_orin = F.normalize(torch.from_numpy(score_result['clip']))
-clip_tree = F.normalize(torch.from_numpy(saved_emb))
-print(np.dot(clip_tree, clip_orin.T))
-
-
-mvo = mean_vec_out[0]
-ooo = frame_for_clip[0].cpu().numpy()
-
-plt.close('all')
-fig = plt.figure()
-
-ax1 = fig.add_subplot(3,2,1)
-ax1.imshow(mvo[0])
-ax2 = fig.add_subplot(3,2,2)
-ax2.imshow(ooo[0])
-ax3 = fig.add_subplot(3,2,3)
-ax3.imshow(mvo[1])
-ax4 = fig.add_subplot(3,2,4)
-ax4.imshow(ooo[1])
-ax5 = fig.add_subplot(3,2,5)
-ax5.imshow(mvo[2])
-ax6 = fig.add_subplot(3,2,6)
-ax6.imshow(ooo[2])
-fig.show()
-
+mr.init_model_det()
+mr.init_model_clip()
 # %%
-raw_vec_out
-mean_vec_out
-
-# %%
-file_to_score = "/home/thebears/source/ml_code/short.mp4"
-vec_file = '/home/thebears/source/ml_code/short.npz'
-out = np.load(vec_file)
-
-mean_vec_path = '/home/thebears/source/ml_code/as_np_mean.npy'
-mean_vec_out = np.load(mean_vec_path)
-
-raw_vec_path = '/home/thebears/source/ml_code/as_np_raw.npy'
-raw_vec_out = np.load(raw_vec_path)
-
-saved_fr = list(out['frame_numbers'])
-saved_emb = out['embeds']
-import numpy as np
-
-
-
-def get_video_info(file_path):
-    file_info = MediaInfo.parse(file_path)
-    video_info = None
-    frame_count = 0
-    if len(file_info.video_tracks) > 0: video_info = file_info.video_tracks[0]
-
-    video_info.frame_count = int(video_info.frame_count)
-    return video_info
-
-
-video_info = get_video_info(file_to_score)
-vid_decoder = "h264parse"
-if video_info.format.lower() == "HEVC".lower():
-    vid_decoder = "h265parse"
-
-
-gst_cmd = "filesrc location={file_to_score} ! qtdemux name=demux demux.video_0 ! queue ! {vid_decoder} ! nvv4l2decoder ! nvvidconv ! videoscale method=1 add-borders=false ! video/x-raw,width=1280,height=1280 ! appsink sync=false".format(
-    file_to_score=file_to_score, vid_decoder=vid_decoder
-)
-
-# gst_cmd = "filesrc location={file_to_score} ! qtdemux name=demux demux.video_0 ! queue ! {vid_decoder} ! nvv4l2decoder ! nvvidconv ! videoscale method=1 add-borders=false ! appsink sync=false".format(file_to_score=file_to_score, vid_decoder=vid_decoder)
-
-cap_handle = cv2.VideoCapture(gst_cmd, cv2.CAP_GSTREAMER)
-
-target_max = det_imgsz
-vid_h = video_info.height
-vid_w = video_info.width
-
-if vid_h > vid_w:
-    target_h = target_max
-    target_w = target_max * vid_w / vid_h
-elif vid_h == vid_w:
-    target_h = target_max
-    target_w = target_max
-elif vid_h < vid_w:
-    target_h = target_max * vid_h / vid_w
-    target_w = target_max
-
-target_h = int(target_h)
-target_w = int(target_w)
-
-pad_amt = [None, None, None, None]
-if target_w % det_stride != 0:
-    off = det_stride - target_w % det_stride
-    new_w = target_w + off
-    pad_diff = new_w - target_w
-    pad_left = round(pad_diff / 2)
-    pad_right = pad_diff - pad_left
-    pad_amt[0] = pad_left
-    pad_amt[2] = pad_right
-else:
-    pad_amt[0] = 0
-    pad_amt[2] = 0
-
-if target_h % det_stride != 0:
-    off = det_stride - target_h % det_stride
-    new_h = target_h + off
-    pad_diff = new_h - target_h
-    pad_up = round(pad_diff / 2)
-    pad_down = pad_diff - pad_up
-    pad_amt[1] = pad_up
-    pad_amt[3] = pad_down
-else:
-    pad_amt[1] = 0
-    pad_amt[3] = 0
-
-
-det_vid_preprocess = transforms.Compose(
-    [transforms.Resize((target_h, target_w)), transforms.Pad(pad_amt, fill=127)]
-)
-
-batch_size = 6
-clip_interval = 10
-array_score = list()
-final_output = dict()
-final_output["start_score_time"] = time.time()
-final_output["num_frames"] = video_info.frame_count
-st = time.time()
-frame_numbers = list()
-det_results = list()
-clip_results = list()
-
-clip_array = list()
-
-for i in range(video_info.frame_count):
-    success, frame_matrix = cap_handle.read()
-
-    clip_array.append((i, frame_matrix))
-
-    if not success:
-        break
-
-
-    array_score.append((i, frame_matrix))
-
-    if len(array_score) >= batch_size:
-        score_result = score_frames(array_score)
-
-
-        det_results.extend(score_result["det"])
-        clip_results.extend(score_result["clip"])
-        frame_numbers.extend(score_result["fr#"])
-        array_score = list()
-
-    if not (i % clip_interval):
-        print('do_clip')
-
-if len(array_score) > 0:
-    score_result = score_frames(array_score)
-    det_results.extend(score_result["det"])
-    clip_results.extend(score_result["clip"])
-    frame_numbers.extend(score_result["fr#"])
-
-cap_handle.release()
-et = time.time()
-
-final_output["end_score_time"] = time.time()
-final_output["video"] = {
-    "w": vid_w,
-    "h": vid_h,
-    "path": file_to_score,
-    "target_w": target_w,
-    "target_h": target_h,
-    "pad_amt": pad_amt,
-}
-
-try:
-    final_output['scoring_fps'] = final_output['num_frames']/ (final_output['end_score_time'] - final_output['start_score_time'])
-except Exception as e:
-    pass
-
-final_output['scores'] = list()
-
-for frame_number, frame in zip(frame_numbers, det_results):
-    cframe_dict = dict()
-    cframe_dict['frame'] = frame_number
-    cframe_dict['score_number'] = frame_number
-    cframe_dict['detections'] = list()
-
-    for det in frame:
-        data = dict()
-        data['coords'] = [float(x) for x in list(det[0:4])]
-        data['score'] = float(det[4])
-        data['idx'] = int(det[5])
-
-        try:
-            data['name'] = det_labels[data['idx']]
-        except:
-            data['name'] = 'Code failed'
-
-        cframe_dict['detections'].append(data)
-
-    final_output['scores'].append(cframe_dict)
+scored_results = mr.score_video('/home/thebears/source/ml_code/short.mp4')
+print(scored_results)
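Read straight down the added lines of the hunk above, the new ml_run.py is an eight-line driver for the ModelRunner class introduced below; the two `# %%` cell markers are the only lines carried over from the old script:

    from model_runner import ModelRunner
    mr = ModelRunner()
    # %%
    mr.init_model_det()
    mr.init_model_clip()
    # %%
    scored_results = mr.score_video('/home/thebears/source/ml_code/short.mp4')
    print(scored_results)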
model_runner.py (new file, 310 changed lines)

@@ -0,0 +1,310 @@
+import sys
+
+sys.path.insert(0, "/home/thebears/source/models/yolov7")
+import time
+import base64 as b64
+from datetime import datetime
+import cv2
+import numpy as np
+import json
+from pymediainfo import MediaInfo
+import inspect
+import open_clip
+import sys
+import torch
+import yaml
+from models.experimental import attempt_load
+from utils.general import check_img_size, non_max_suppression
+from torchvision import transforms
+import torch.nn.functional as F
+import os
+device = torch.device("cuda")
+
+
+# %%
+class ModelRunner:
+    def __init__(self):
+        self.pretrained_name = "webli"
+        self.model_name = "ViT-SO400M-16-SigLIP2-512"
+        self.det_root_path = "/home/thebears/source/model_weights"
+
+    def init_model_clip(self):
+        if hasattr(self, 'clip_preprocess'):
+            return
+
+
+        model_name = self.model_name
+        pretrained_name = self.pretrained_name
+
+        clip_model, _, clip_preprocess_og = open_clip.create_model_and_transforms(
+            model_name, pretrained=pretrained_name
+        )
+        tokenizer = open_clip.get_tokenizer("hf-hub:timm/" + model_name)
+        clip_model = clip_model.half().to(device)
+        clip_dtype = next(clip_model.parameters()).dtype
+        clip_img_size = clip_preprocess_og.transforms[0].size
+        clip_model.encode_image(
+            torch.rand(1, 3, *clip_img_size, dtype=clip_dtype, device=device))
+        clip_preprocess = transforms.Compose(
+            [clip_preprocess_og.transforms[x] for x in [0, 3]]
+        )
+        self.clip_model = clip_model
+        self.clip_preprocess_og = clip_preprocess_og
+        self.clip_tokenizer = tokenizer
+        self.clip_dtype = clip_dtype
+        self.clip_img_size = clip_img_size
+        self.clip_preprocess = clip_preprocess
+
+    def init_model_det(self):
+        if hasattr(self, 'det_model'):
+            return
+
+        det_root_path = self.det_root_path
+        det_model_weights_root = os.path.join(det_root_path, "yolov7")
+        det_model_weights_path = os.path.join(det_model_weights_root, "best.pt")
+        det_data_yaml_path = os.path.join(det_model_weights_root, "inaturalist.yaml")
+        det_model = attempt_load(det_model_weights_path, map_location=device)
+        det_model = det_model.half().to(device)
+
+        det_dtype = next(det_model.parameters()).dtype
+        det_imgsz = 1280
+        det_stride = int(det_model.stride.max())
+        det_imgsz = check_img_size(det_imgsz, s=det_stride)
+        _ = det_model(
+            torch.zeros(1, 3, det_imgsz, det_imgsz, dtype=det_dtype).to(device)
+        )
+
+        with open(det_data_yaml_path, "r") as ff:
+            det_model_info = yaml.safe_load(ff)
+        det_labels = det_model_info["names"]
+
+        self.det_dtype = det_dtype
+        self.det_imgsz = det_imgsz
+        self.det_stride = det_stride
+        self.det_model_info = det_model_info
+        self.det_labels = det_labels
+        self.det_model = det_model
+
+    def get_det_vid_preprocessor(self, vid_h, vid_w):
+        if not hasattr(self, "_det_vid_preprocessors"):
+            self._det_vid_preprocessors = dict()
+            self.curr_det_vid_preprocessor = None
+        dict_key = (vid_h, vid_w)
+        det_stride = self.det_stride
+        if dict_key in self._det_vid_preprocessors:
+            self.curr_det_vid_preprocessor = self._det_vid_preprocessors[dict_key]
+            return self.curr_det_vid_preprocessor
+
+        target_max = self.det_imgsz
+
+        if vid_h > vid_w:
+            target_h = target_max
+            target_w = target_max * vid_w / vid_h
+        elif vid_h == vid_w:
+            target_h = target_max
+            target_w = target_max
+        elif vid_h < vid_w:
+            target_h = target_max * vid_h / vid_w
+            target_w = target_max
+
+        target_h = int(target_h)
+        target_w = int(target_w)
+
+        pad_amt = [None, None, None, None]
+        if target_w % det_stride != 0:
+            off = det_stride - target_w % det_stride
+            new_w = target_w + off
+            pad_diff = new_w - target_w
+            pad_left = round(pad_diff / 2)
+            pad_right = pad_diff - pad_left
+            pad_amt[0] = pad_left
+            pad_amt[2] = pad_right
+        else:
+            pad_amt[0] = 0
+            pad_amt[2] = 0
+
+        if target_h % det_stride != 0:
+            off = det_stride - target_h % det_stride
+            new_h = target_h + off
+            pad_diff = new_h - target_h
+            pad_up = round(pad_diff / 2)
+            pad_down = pad_diff - pad_up
+            pad_amt[1] = pad_up
+            pad_amt[3] = pad_down
+        else:
+            pad_amt[1] = 0
+            pad_amt[3] = 0
+
+        det_vid_preprocess = transforms.Compose(
+            [transforms.Resize((target_h, target_w)), transforms.Pad(pad_amt, fill=127)]
+        )
+
+        self.target_h = target_h
+        self.target_w = target_w
+        self.pad_amt = pad_amt
+
+        self._det_vid_preprocessors[dict_key] = det_vid_preprocess
+        self.curr_det_vid_preprocessor = self._det_vid_preprocessors[dict_key]
+        return self.curr_det_vid_preprocessor
+
+    def score_frames_det(self, array_score, det_vid_preprocess=None):
+        det_model = self.det_model
+        if det_vid_preprocess is None:
+            det_vid_preprocess = self.curr_det_vid_preprocessor
+
+        frame_numbers = [x[0] for x in array_score]
+        frame_values = [x[1] for x in array_score]
+        frame_as_tensor = (
+            torch.from_numpy(np.stack(frame_values)[:, :, :, 0:3])
+            .to(torch.float16)
+            .to(device)
+            .permute([0, 3, 1, 2])
+        )
+
+        with torch.no_grad():
+            frame_for_model = det_vid_preprocess(frame_as_tensor).div(255)[
+                :, [2, 1, 0], :, :
+            ]
+            det_preds = det_model(frame_for_model)[0]
+            det_pred_post_nms = non_max_suppression(det_preds, 0.25, 0.5)
+            det_cpu_pred = [x.detach().cpu().numpy() for x in det_pred_post_nms]
+
+        return {"det": det_cpu_pred, "fr#": frame_numbers}
+
+    def score_frames_clip(self, clip_array_score):
+        frame_numbers = [x[0] for x in clip_array_score]
+        frame_values = [x[1] for x in clip_array_score]
+        frame_as_tensor = (
+            torch.from_numpy(np.stack(frame_values)[:, :, :, 0:3])
+            .to(torch.float16)
+            .to(device)
+            .permute([0, 3, 1, 2])
+        )
+
+        with torch.no_grad():
+            frame_for_clip = self.clip_preprocess(frame_as_tensor[:, [0, 1, 2], :, :])
+            clip_pred = self.clip_model.encode_image(frame_for_clip).detach().cpu().numpy()
+
+        return {"clip": clip_pred, "fr#": frame_numbers}
+
+    def get_video_info(self, file_path):
+        file_info = MediaInfo.parse(file_path)
+        video_info = None
+        frame_count = 0
+        if len(file_info.video_tracks) > 0:
+            video_info = file_info.video_tracks[0]
+
+        video_info.frame_count = int(video_info.frame_count)
+        return video_info
+
+    def score_video(self, file_to_score, batch_size = 6, clip_interval = 10):
+        video_info = self.get_video_info(file_to_score)
+        vid_decoder = "h264parse"
+        if video_info.format.lower() == "HEVC".lower():
+            vid_decoder = "h265parse"
+
+        gst_cmd = "filesrc location={file_to_score} ! qtdemux name=demux demux.video_0 ! queue ! {vid_decoder} ! nvv4l2decoder ! nvvidconv ! videoscale method=1 add-borders=false ! video/x-raw,width=1280,height=1280 ! appsink sync=false".format(
+            file_to_score=file_to_score, vid_decoder=vid_decoder
+        )
+        cap_handle = cv2.VideoCapture(gst_cmd, cv2.CAP_GSTREAMER)
+        vid_h = video_info.height
+        vid_w = video_info.width
+
+        vid_preprocessor = self.get_det_vid_preprocessor(vid_h, vid_w)
+        target_w = self.target_w
+        target_h = self.target_h
+        pad_amt = self.pad_amt
+
+
+        array_score = list()
+        final_output = dict()
+        final_output["start_score_time"] = time.time()
+        final_output["num_frames"] = video_info.frame_count
+        st = time.time()
+        frame_numbers = list()
+        det_results = list()
+        clip_results = list()
+        clip_frame_numbers = list()
+
+        clip_array = list()
+
+        for i in range(video_info.frame_count):
+            success, frame_matrix = cap_handle.read()
+
+            if not success:
+                break
+
+            array_score.append((i, frame_matrix))
+
+            if len(array_score) >= batch_size:
+                score_result = self.score_frames_det(array_score, det_vid_preprocess = vid_preprocessor)
+                det_results.extend(score_result["det"])
+                frame_numbers.extend(score_result["fr#"])
+                array_score = list()
+
+            if not (i % clip_interval):
+                clip_score_result = self.score_frames_clip([(i, frame_matrix)])
+                clip_results.extend(clip_score_result["clip"])
+                clip_frame_numbers.extend(clip_score_result["fr#"])
+
+
+        if len(array_score) > 0:
+            score_result = self.score_frames_det(array_score, det_vid_preprocess = vid_preprocessor)
+            det_results.extend(score_result["det"])
+            frame_numbers.extend(score_result["fr#"])
+
+        cap_handle.release()
+
+
+        final_output["end_score_time"] = time.time()
+        final_output["video"] = {
+            "w": vid_w,
+            "h": vid_h,
+            "path": file_to_score,
+            "target_w": target_w,
+            "target_h": target_h,
+            "pad_amt": pad_amt,
+        }
+
+        try:
+            final_output["scoring_fps"] = final_output["num_frames"] / (
+                final_output["end_score_time"] - final_output["start_score_time"]
+            )
+        except Exception as e:
+            pass
+
+        final_output["scores"] = list()
+
+
+        clip_results_as_np = np.asarray(clip_results)
+
+        for frame_number, frame in zip(frame_numbers, det_results):
+            cframe_dict = dict()
+            cframe_dict["frame"] = frame_number
+            cframe_dict["detections"] = list()
+
+            for det in frame:
+                data = dict()
+                data["coords"] = [float(x) for x in list(det[0:4])]
+                data["score"] = float(det[4])
+                data["idx"] = int(det[5])
+
+                try:
+                    data["name"] = det_labels[data["idx"]]
+                except:
+                    data["name"] = "Code failed"
+
+                cframe_dict["detections"].append(data)
+
+            final_output["scores"].append(cframe_dict)
+
+        emb_dict = dict()
+
+        emb_dict["frame_numbers"] = clip_frame_numbers
+        emb_dict["array_size"] = clip_results_as_np.shape
+        emb_dict["array_dtype"] = str(clip_results_as_np.dtype)
+        emb_dict["array_binary"] = b64.b64encode(clip_results_as_np).decode()
+
+        final_output["embeds"] = emb_dict
+
+        return final_output
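score_video() ships the CLIP embeddings base64-encoded under final_output["embeds"], but nothing in this diff decodes that payload. A minimal sketch of the inverse, assuming the array was C-contiguous when b64.b64encode read its buffer (which holds for the np.asarray result above); decode_embeds is a hypothetical helper, not part of the commit:

    import base64 as b64
    import numpy as np

    def decode_embeds(emb_dict):
        # Hypothetical helper: rebuild the CLIP embedding matrix from the
        # base64 payload produced by ModelRunner.score_video().
        raw = b64.b64decode(emb_dict["array_binary"])
        arr = np.frombuffer(raw, dtype=np.dtype(emb_dict["array_dtype"]))
        return arr.reshape(emb_dict["array_size"]), emb_dict["frame_numbers"]

Because np.frombuffer reads raw bytes with the stored dtype and C layout, this round-trips exactly what b64.b64encode(clip_results_as_np) serialized.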