Compare commits


2 Commits

Author SHA1 Message Date
c8dbef2c0f YACWC 2025-06-30 14:19:58 -04:00
21b7ccb794 bump 2025-06-30 14:19:57 -04:00
11 changed files with 96385 additions and 48 deletions

.gitignore vendored

@@ -3,4 +3,6 @@
 *.pt
 *.onnx
 *.trt
+*.whl
+*.npy
 runs/

View File

@@ -1,6 +1,9 @@
+import numpy as np
 import json
-datum = np.load('dump.npz.npy')
-import numpy as np
+#datum = np.load('dump.npz.npy')
+datum = np.load('dump_so400m.npy')
 with open('dump.json','r') as rr:
@@ -29,25 +32,15 @@ def cosine_sim(emb_in_1, emb_in_2):
 arr_in_deepstream = np.asarray([y for _,y in emb_dict.items()])
 normed = np.divide(datum.T, np.linalg.norm(datum, axis=1)).T
 print('_________________________')
-print(len(emb_dict))
-print(len(datum))
 for fr, emb in emb_dict.items():
     emb1 = np.linalg.norm(emb)
     emb2 = np.linalg.norm(datum[fr])
-    # print( cosine_sim(emb, datum[fr]))
+    print( cosine_sim(emb, datum[fr]))
-print('Deepstream and Actual norm')
-print(np.max(np.dot(arr_in_deepstream, normed.T),axis=1))
-print('_________________________')
-for dat in datum:
-    # print(cosine_sim(dat, datum[0]))
-    pass
-#print(cosine_sim(datum[fr], datum[fr+1]))
-#print(cosine_sim(emb_dict[fr], emb_dict[fr+1]))
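The hunk header above references a cosine_sim helper whose body is outside this hunk. A minimal sketch of such a helper, assuming plain numpy vectors (only the two-argument signature comes from the hunk header; the body here is illustrative):

import numpy as np

def cosine_sim(emb_in_1, emb_in_2):
    # Normalize both embeddings and take their dot product; values near 1.0
    # mean the DeepStream embedding matches the reference embedding.
    a = np.asarray(emb_in_1, dtype=np.float64).ravel()
    b = np.asarray(emb_in_2, dtype=np.float64).ravel()
    return float(np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b)))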

View File

@@ -178,7 +178,7 @@ def embed_results_probe(pad, info, u_data, list_add, frame_num=0):
     if True:
         for i in range(tensor_meta.num_output_layers):
             layer = pyds.get_nvds_LayerInfo(tensor_meta, i)
-            if layer.layerName == "embedding":
+            if layer.layerName == "output":
                 ptr = ctypes.cast(
                     pyds.get_ptr(layer.buffer), ctypes.POINTER(ctypes.c_float)
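The truncated hunk above shows the probe matching the renamed output layer and casting its buffer to a float pointer. A hedged sketch of how such a probe typically copies the tensor out of the DeepStream metadata, assuming a 1152-dimensional SigLIP SO400M embedding and a helper name that does not appear in this diff:

import ctypes
import numpy as np
import pyds

EMBED_DIM = 1152  # assumed output size of the SigLIP SO400M image tower

def copy_embedding(tensor_meta):
    # Walk the output layers attached by nvinfer and pull out the one named "output".
    for i in range(tensor_meta.num_output_layers):
        layer = pyds.get_nvds_LayerInfo(tensor_meta, i)
        if layer.layerName != "output":
            continue
        ptr = ctypes.cast(pyds.get_ptr(layer.buffer), ctypes.POINTER(ctypes.c_float))
        # Copy so the array stays valid after the buffer is unmapped downstream.
        return np.ctypeslib.as_array(ptr, shape=(EMBED_DIM,)).copy()
    return None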
@@ -395,8 +395,8 @@ if True:
streammux_embed.set_property("width", target_width_embed) streammux_embed.set_property("width", target_width_embed)
streammux_embed.set_property("height", target_height_embed) streammux_embed.set_property("height", target_height_embed)
streammux_embed.set_property("batched-push-timeout", MUXER_BATCH_TIMEOUT_USEC) streammux_embed.set_property("batched-push-timeout", MUXER_BATCH_TIMEOUT_USEC)
streammux_embed.set_property("enable-padding", 1) streammux_embed.set_property("enable-padding", 0)
streammux_embed.set_property("batch-size", 4) streammux_embed.set_property("batch-size", 1)
nugget_embed = Gst.ElementFactory.make("nvinfer", "primary-inference") nugget_embed = Gst.ElementFactory.make("nvinfer", "primary-inference")
nugget_embed.set_property( nugget_embed.set_property(
@@ -426,7 +426,7 @@ if True:
     # capsfilter.link(tee)
     nvvidconv.link(tee)
-    if True:
+    if False:
         pipeline.add(queue_detect)
         pipeline.add(streammux_detect)
         pipeline.add(nugget_detector)
@@ -442,7 +442,7 @@ if True:
     os.environ["GST_DEBUG_DUMP_DOT_DIR"] = "/tmp"
     os.putenv("GST_DEBUG_DUMP_DIR_DIR", "/tmp")
-    if False:
+    if True:
         pipeline.add(queue_embed)
         pipeline.add(streammux_embed)
@@ -522,7 +522,13 @@ if True:
         pass
     # cleanup
     pipeline.set_state(Gst.State.NULL)
-    # return detector_list, embed_list
+    # return detector_list, embed_list\\
+    out = [detector_list, embed_list ]
+    import json
+    with open("dump.json", "w") as ff:
+        json.dump([out[0], out[1]], ff)
+    sys.exit()
 if __name__ == "__main__":
@@ -537,6 +543,3 @@ if __name__ == "__main__":
     import json
-    with open("dump.json", "w") as ff:
-        json.dump([out[0], out[1]], ff)
-    sys.exit()
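The dump.json written by the lines added above is what the comparison script at the top of this compare reads back. A sketch of the corresponding load side, assuming embed_list serializes as (frame_number, embedding) pairs; the actual structure of dump.json is not shown in this diff:

import json
import numpy as np

with open("dump.json", "r") as rr:
    detector_list, embed_list = json.load(rr)

# Assumed layout: each entry pairs a frame number with its embedding vector.
emb_dict = {int(fr): np.asarray(emb, dtype=np.float32) for fr, emb in embed_list}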

File diff suppressed because one or more lines are too long

min_repro.py Normal file (336 added lines)

@@ -0,0 +1,336 @@
import io

import tensorrt as trt
import torch
import torch.nn as nn
import torch.nn.functional as F


class AttentionUsingScaledDotProduct(nn.Module):
    """
    An alternative implementation of the Attention layer using `F.scaled_dot_product_attention`, which is ~50% faster,
    but doesn't compile correctly when using TensorRT v10.
    """

    def __init__(
        self,
        dim,
        num_heads=8,
        qkv_bias=False,
        qk_scale=None,
        attn_drop=0.0,
        proj_drop=0.0,
        attn_head_dim=None,
    ):
        super().__init__()
        self.num_heads = num_heads
        head_dim = dim // num_heads
        if attn_head_dim is not None:
            head_dim = attn_head_dim
        all_head_dim = head_dim * self.num_heads
        self.scale = qk_scale or head_dim**-0.5
        self.qkv = nn.Linear(dim, all_head_dim * 3, bias=False)
        if qkv_bias:
            self.q_bias = nn.Parameter(torch.zeros(all_head_dim))
            self.v_bias = nn.Parameter(torch.zeros(all_head_dim))
        else:
            self.q_bias = None
            self.v_bias = None
        self.attn_drop = nn.Dropout(attn_drop)
        self.proj = nn.Linear(all_head_dim, dim)
        self.proj_drop = nn.Dropout(proj_drop)

    def forward(self, x):
        B, N, C = x.shape
        qkv_bias = None
        if self.q_bias is not None:
            qkv_bias = torch.cat(
                (
                    self.q_bias,
                    torch.zeros_like(self.v_bias, requires_grad=False),
                    self.v_bias,
                )
            )
        qkv = F.linear(input=x, weight=self.qkv.weight, bias=qkv_bias)
        qkv = qkv.reshape(B, N, 3, self.num_heads, -1).permute(2, 0, 3, 1, 4)
        q, k, v = qkv.unbind(0)
        x = F.scaled_dot_product_attention(
            q,
            k,
            v,
            dropout_p=self.attn_drop.p if self.training else 0.0,
            scale=self.scale,
        )
        x = x.transpose(1, 2).reshape(B, N, -1)
        x = self.proj(x)
        x = self.proj_drop(x)
        return x


class ExplicitAttention(nn.Module):
    """
    The explicit, original version of the Attention layer from the VideoMAEv2 codebase.
    """

    def __init__(
        self,
        dim,
        num_heads=8,
        qkv_bias=False,
        qk_scale=None,
        attn_drop=0.0,
        proj_drop=0.0,
        attn_head_dim=None,
    ):
        super().__init__()
        self.num_heads = num_heads
        head_dim = dim // num_heads
        if attn_head_dim is not None:
            head_dim = attn_head_dim
        all_head_dim = head_dim * self.num_heads
        self.scale = qk_scale or head_dim**-0.5
        self.qkv = nn.Linear(dim, all_head_dim * 3, bias=False)
        if qkv_bias:
            self.q_bias = nn.Parameter(torch.zeros(all_head_dim))
            self.v_bias = nn.Parameter(torch.zeros(all_head_dim))
        else:
            self.q_bias = None
            self.v_bias = None
        self.attn_drop = nn.Dropout(attn_drop)
        self.proj = nn.Linear(all_head_dim, dim)
        self.proj_drop = nn.Dropout(proj_drop)

    def forward(self, x):
        B, N, C = x.shape
        qkv_bias = None
        if self.q_bias is not None:
            qkv_bias = torch.cat(
                (
                    self.q_bias,
                    torch.zeros_like(self.v_bias, requires_grad=False),
                    self.v_bias,
                )
            )
        qkv = F.linear(input=x, weight=self.qkv.weight, bias=qkv_bias)
        qkv = qkv.reshape(B, N, 3, self.num_heads, -1).permute(2, 0, 3, 1, 4)
        q, k, v = qkv.unbind(0)
        q = q * self.scale
        attn = q @ k.transpose(-2, -1)
        attn = attn.softmax(dim=-1)
        attn = self.attn_drop(attn)
        x = (attn @ v).transpose(1, 2).reshape(B, N, -1)
        x = self.proj(x)
        x = self.proj_drop(x)
        return x


class AttentionUsingMHAForward(nn.Module):
    def __init__(
        self,
        dim,
        num_heads=8,
        qkv_bias=False,
        qk_scale=None,
        attn_drop=0.0,
        proj_drop=0.0,
        attn_head_dim=None,
    ):
        super().__init__()
        self.num_heads = num_heads
        head_dim = dim // num_heads
        if attn_head_dim is not None:
            head_dim = attn_head_dim
        all_head_dim = head_dim * self.num_heads
        self.scale = qk_scale or head_dim**-0.5
        self.qkv = nn.Linear(dim, all_head_dim * 3, bias=False)
        if qkv_bias:
            self.q_bias = nn.Parameter(torch.zeros(all_head_dim))
            self.v_bias = nn.Parameter(torch.zeros(all_head_dim))
        else:
            self.q_bias = None
            self.v_bias = None
        self.attn_drop = nn.Dropout(attn_drop)
        self.proj = nn.Linear(all_head_dim, dim)
        self.proj_drop = nn.Dropout(proj_drop)

    def forward(self, x):
        B, N, C = x.shape
        qkv_bias = None
        if self.q_bias is not None:
            qkv_bias = torch.cat(
                (
                    self.q_bias,
                    torch.zeros_like(self.v_bias, requires_grad=False),
                    self.v_bias,
                )
            )
        qkv = F.linear(input=x, weight=self.qkv.weight, bias=qkv_bias)
        qkv = qkv.reshape(B, N, 3, self.num_heads, -1).permute(2, 0, 3, 1, 4)
        q, k, v = qkv.unbind(0)
        # MHA expects [sequence, batch, embed_dim].
        x_t = x.transpose(0, 1)  # => [N, B, C]
        attn_out, _ = F.multi_head_attention_forward(
            x_t,
            x_t,
            x_t,
            embed_dim_to_check=C,
            num_heads=self.num_heads,
            # Since use_separate_proj_weight=False (default), then according to the docs:
            # "in_proj_weight will be used, which is a combination of q_proj_weight, k_proj_weight, v_proj_weight."
            in_proj_weight=self.qkv.weight,
            in_proj_bias=qkv_bias,
            bias_k=None,
            bias_v=None,
            add_zero_attn=False,
            dropout_p=self.attn_drop.p,
            out_proj_weight=self.proj.weight,
            out_proj_bias=self.proj.bias,
            training=self.training,
            key_padding_mask=None,
            need_weights=False,
            attn_mask=None,
        )
        # Transpose back to [B, N, C].
        x = attn_out.transpose(0, 1)
        return x


def onnx_to_trt(onnx_bytes: bytes) -> bytes:
    TRT_LOGGER = trt.Logger(trt.Logger.INFO)
    builder = trt.Builder(TRT_LOGGER)
    network = builder.create_network()
    parser = trt.OnnxParser(network, TRT_LOGGER)
    parser.parse(onnx_bytes)
    config = builder.create_builder_config()
    config.builder_optimization_level = 0
    engine = builder.build_serialized_network(network, config)
    return engine


def build_trt_module(model, x):
    onnx_bytes = io.BytesIO()
    torch.onnx.export(
        model,
        (x,),
        onnx_bytes,
        export_params=True,
        opset_version=17,
        do_constant_folding=True,
        input_names=["x"],
        output_names=["y"],
    )
    trt_engine = onnx_to_trt(onnx_bytes.getvalue())
    return trt_engine


#@torch.inference_mode()
#def main():
with torch.no_grad():
    torch.manual_seed(0)
    EMB_DIM = 384
    x = torch.rand((6, 1568, EMB_DIM))
    explicit_attention = ExplicitAttention(EMB_DIM)
    sdpa = AttentionUsingScaledDotProduct(EMB_DIM)
    mha_fwd = AttentionUsingMHAForward(EMB_DIM)
    # Use the same params for all.
    sdpa.load_state_dict(explicit_attention.state_dict())
    mha_fwd.load_state_dict(explicit_attention.state_dict())
    sdpa_torch_y = sdpa(x)
    explicit_attention_torch_y = explicit_attention(x)
    mha_fwd_torch_y = mha_fwd(x)
    print(
        "Torch: [explicit<->sdpa] Is allclose?",
        sdpa_torch_y.allclose(explicit_attention_torch_y, atol=0.0001),
    )
    print(
        "Torch: [explicit<->mha_fwd] Is allclose?",
        mha_fwd_torch_y.allclose(explicit_attention_torch_y, atol=0.0001),
    )
    print(
        "Torch: [explicit<->sdpa] Total difference:",
        (sdpa_torch_y - explicit_attention_torch_y).abs().sum(),
    )
    print(
        "Torch: [explicit<->mha_fwd] Total difference:",
        (mha_fwd_torch_y - explicit_attention_torch_y).abs().sum(),
    )
    assert sdpa_torch_y.allclose(explicit_attention_torch_y, atol=0.0001), "Precheck"
    assert mha_fwd_torch_y.allclose(explicit_attention_torch_y, atol=0.0001), "Precheck"
    # %%
    explicit_attention_trt = build_trt_module(explicit_attention, x)
    with open('explicit_attention_trt.trt','wb') as ea:
        ea.write(explicit_attention_trt)
    sdpa_trt_model = build_trt_module(sdpa, x)
    with open('sdpa_trt.trt','wb') as ea:
        ea.write(sdpa_trt_model)
    mha_fwd_trt_model = build_trt_module(mha_fwd, x)
    with open('mha_trt.trt','wb') as ea:
        ea.write(mha_fwd_trt_model)
    # %%
    # %%
    explicit_attention_y = explicit_attention_trt(x.cuda())
    sdpa_y = sdpa_trt_model(x.cuda())
    mha_fwd_y = mha_fwd_trt_model(x.cuda())
    print(
        "TRT: [explicit<->sdpa] Is allclose?",
        sdpa_y.allclose(explicit_attention_y, atol=0.0001),
    )
    print(
        "TRT: [explicit<->sdpa] Total difference:",
        (sdpa_y - explicit_attention_y).abs().sum(),
    )
    print(
        "TRT: [explicit<->mha_fwd] Is allclose?",
        mha_fwd_y.allclose(explicit_attention_y, atol=0.0001),
    )
    print(
        "TRT: [explicit<->mha_fwd] Total difference:",
        (mha_fwd_y - explicit_attention_y).abs().sum(),
    )
    print("TRT: Explicit Attention:", explicit_attention_y[0, 0, :32])
    print("TRT: Scaled Dot Product Attention:", sdpa_y[0, 0, :32])
    print("TRT: MHA Forward:", mha_fwd_y[0, 0, :32])


if __name__ == "__main__":
    main()
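Note that build_trt_module returns a serialized engine rather than a callable module, so the final cell's calls such as explicit_attention_trt(x.cuda()) presume a wrapper that deserializes and executes the engine. A minimal sketch of that step, assuming the TensorRT 10 tensor-name API and the "x"/"y" names used in the ONNX export above (the run_trt helper is not part of this file):

import tensorrt as trt
import torch

def run_trt(serialized_engine, x: torch.Tensor) -> torch.Tensor:
    # Deserialize the engine produced by build_trt_module and run one inference.
    logger = trt.Logger(trt.Logger.WARNING)
    engine = trt.Runtime(logger).deserialize_cuda_engine(serialized_engine)
    context = engine.create_execution_context()
    x = x.contiguous().cuda()
    y = torch.empty(tuple(context.get_tensor_shape("y")), dtype=torch.float32, device="cuda")
    context.set_tensor_address("x", x.data_ptr())
    context.set_tensor_address("y", y.data_ptr())
    context.execute_async_v3(torch.cuda.current_stream().cuda_stream)
    torch.cuda.synchronize()
    return y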

ml_run.py Normal file (334 added lines)

@@ -0,0 +1,334 @@
import sys

sys.path.insert(0, "/home/thebears/source/models/yolov7")
import time
from datetime import datetime

import cv2
import numpy as np
from pymediainfo import MediaInfo
import inspect
import open_clip
import sys
import torch
import yaml
from models.experimental import attempt_load
from utils.general import check_img_size, non_max_suppression
from torchvision import transforms

device = torch.device("cuda")
pretrained_name = "webli"
#model_name = "ViT-SO400M-16-SigLIP2-512"
#model_name = 'ViT-SO400M-14-SigLIP-384'
clip_model, _, clip_preprocess_og = open_clip.create_model_and_transforms(
    model_name, pretrained=pretrained_name
)
tokenizer = open_clip.get_tokenizer('hf-hub:timm/'+model_name)
labels_list = ["A bird with a brown head and black body", "A bird with a black head and black body"]
text = tokenizer(labels_list, context_length=clip_model.context_length)
import torch.nn.functional as F

with torch.no_grad():
    text_features = clip_model.encode_text(text).detach().cpu()
    text_features = F.normalize(text_features, dim=-1).detach().cpu()
# %%
clip_model = clip_model.half().to(device)
clip_dtype = next(clip_model.parameters()).dtype
clip_img_size = clip_preprocess_og.transforms[0].size
_ = clip_model.encode_image(
    torch.rand(1, 3, *clip_img_size, dtype=clip_dtype, device=device)
)
clip_preprocess = transforms.Compose([clip_preprocess_og.transforms[x] for x in [0, 3]])
det_root_path = "/home/thebears/source/model_weights"
det_model_weights_root = os.path.join(det_root_path, "yolov7")
det_model_weights_path = os.path.join(det_model_weights_root, "best.pt")
det_data_yaml_path = os.path.join(det_model_weights_root, "inaturalist.yaml")
det_model = attempt_load(det_model_weights_path, map_location=device)
det_model = det_model.half().to(device)
det_dtype = next(det_model.parameters()).dtype
det_imgsz = 1280
det_stride = int(det_model.stride.max())
det_imgsz = check_img_size(det_imgsz, s=det_stride)
_ = det_model(torch.zeros(1, 3, det_imgsz, det_imgsz, dtype=det_dtype).to(device))
with open(det_data_yaml_path, "r") as ff:
    det_model_info = yaml.safe_load(ff)
det_labels = det_model_info["names"]
array_score = clip_array
frame_numbers = [x[0] for x in array_score]
frame_values = [x[1] for x in array_score]
frame_as_tensor = (
    torch.from_numpy(np.stack(frame_values)[:, :, :, 0:3])
    .to(torch.float16)
    .to(device)
    .permute([0, 3, 1, 2])
)


def score_frames_det(array_score):
    frame_numbers = [x[0] for x in array_score]
    frame_values = [x[1] for x in array_score]
    frame_as_tensor = (
        torch.from_numpy(np.stack(frame_values)[:, :, :, 0:3])
        .to(torch.float16)
        .to(device)
        .permute([0, 3, 1, 2])
    )
    with torch.no_grad():
        frame_for_model = det_vid_preprocess(frame_as_tensor).div(255)[:,[2,1,0],:,:]
        det_preds = det_model(frame_for_model)[0]
        det_pred_post_nms = non_max_suppression(det_preds,0.25, 0.5)
        det_cpu_pred = [x.detach().cpu().numpy() for x in det_pred_post_nms]
        # frame_for_clip = clip_preprocess(frame_as_tensor[:,[0,1,2],:,:])
        # clip_pred = clip_model.encode_image(frame_for_clip).detach().cpu().numpy()
    return {"det": det_cpu_pred, "fr#": frame_numbers}


def score_frames_clip(array_score):
    frame_numbers = [x[0] for x in array_score]
    frame_values = [x[1] for x in array_score]
    frame_as_tensor = (
        torch.from_numpy(np.stack(frame_values)[:, :, :, 0:3])
        .to(torch.float16)
        .to(device)
        .permute([0, 3, 1, 2])
    )
    with torch.no_grad():
        # frame_for_model = det_vid_preprocess(frame_as_tensor).div(255)[:,[2,1,0],:,:]
        # det_preds = det_model(frame_for_model)[0]
        # det_pred_post_nms = non_max_suppression(det_preds,0.25, 0.5)
        # det_cpu_pred = [x.detach().cpu().numpy() for x in det_pred_post_nms]
        frame_for_clip = clip_preprocess(frame_as_tensor[:,[0,1,2],:,:])
        clip_pred = clip_model.encode_image(frame_for_clip).detach().cpu().numpy()
    return {"clip": clip_pred, "fr#": frame_numbers}


with torch.no_grad():
    frame_for_model = det_vid_preprocess(frame_as_tensor).div(255)[:,[2,1,0],:,:]
    det_preds = det_model(frame_for_model)[0]
    det_pred_post_nms = non_max_suppression(det_preds,0.25, 0.5)
    det_cpu_pred = [x.detach().cpu().numpy() for x in det_pred_post_nms]
    frame_for_clip = frame_as_tensor.div(255)
    frame_for_clip = clip_preprocess(frame_for_clip[:,(2,1,0),:,:])
    clip_pred = clip_model.encode_image(frame_for_clip).detach().cpu().numpy()
score_result = {"det": det_cpu_pred, "clip": clip_pred, "fr#": frame_numbers}
clip_orin = F.normalize(torch.from_numpy(score_result['clip']))
clip_tree = F.normalize(torch.from_numpy(saved_emb))
print(np.dot(clip_tree, clip_orin.T))
mvo = mean_vec_out[0]
ooo = frame_for_clip[0].cpu().numpy()
plt.close('all')
fig = plt.figure()
ax1 = fig.add_subplot(3,2,1)
ax1.imshow(mvo[0])
ax2 = fig.add_subplot(3,2,2)
ax2.imshow(ooo[0])
ax3 = fig.add_subplot(3,2,3)
ax3.imshow(mvo[1])
ax4 = fig.add_subplot(3,2,4)
ax4.imshow(ooo[1])
ax5 = fig.add_subplot(3,2,5)
ax5.imshow(mvo[2])
ax6 = fig.add_subplot(3,2,6)
ax6.imshow(ooo[2])
fig.show()
# %%
raw_vec_out
mean_vec_out
# %%
file_to_score = "/home/thebears/source/ml_code/short.mp4"
vec_file = '/home/thebears/source/ml_code/short.npz'
out = np.load(vec_file)
mean_vec_path = '/home/thebears/source/ml_code/as_np_mean.npy'
mean_vec_out = np.load(mean_vec_path)
raw_vec_path = '/home/thebears/source/ml_code/as_np_raw.npy'
raw_vec_out = np.load(raw_vec_path)
saved_fr = list(out['frame_numbers'])
saved_emb = out['embeds']
import numpy as np


def get_video_info(file_path):
    file_info = MediaInfo.parse(file_path)
    video_info = None
    frame_count = 0
    if len(file_info.video_tracks) > 0: video_info = file_info.video_tracks[0]
    video_info.frame_count = int(video_info.frame_count)
    return video_info


video_info = get_video_info(file_to_score)
vid_decoder = "h264parse"
if video_info.format.lower() == "HEVC".lower():
    vid_decoder = "h265parse"
gst_cmd = "filesrc location={file_to_score} ! qtdemux name=demux demux.video_0 ! queue ! {vid_decoder} ! nvv4l2decoder ! nvvidconv ! videoscale method=1 add-borders=false ! video/x-raw,width=1280,height=1280 ! appsink sync=false".format(
    file_to_score=file_to_score, vid_decoder=vid_decoder
)
# gst_cmd = "filesrc location={file_to_score} ! qtdemux name=demux demux.video_0 ! queue ! {vid_decoder} ! nvv4l2decoder ! nvvidconv ! videoscale method=1 add-borders=false ! appsink sync=false".format(file_to_score=file_to_score, vid_decoder=vid_decoder)
cap_handle = cv2.VideoCapture(gst_cmd, cv2.CAP_GSTREAMER)
target_max = det_imgsz
vid_h = video_info.height
vid_w = video_info.width
if vid_h > vid_w:
    target_h = target_max
    target_w = target_max * vid_w / vid_h
elif vid_h == vid_w:
    target_h = target_max
    target_w = target_max
elif vid_h < vid_w:
    target_h = target_max * vid_h / vid_w
    target_w = target_max
target_h = int(target_h)
target_w = int(target_w)
pad_amt = [None, None, None, None]
if target_w % det_stride != 0:
    off = det_stride - target_w % det_stride
    new_w = target_w + off
    pad_diff = new_w - target_w
    pad_left = round(pad_diff / 2)
    pad_right = pad_diff - pad_left
    pad_amt[0] = pad_left
    pad_amt[2] = pad_right
else:
    pad_amt[0] = 0
    pad_amt[2] = 0
if target_h % det_stride != 0:
    off = det_stride - target_h % det_stride
    new_h = target_h + off
    pad_diff = new_h - target_h
    pad_up = round(pad_diff / 2)
    pad_down = pad_diff - pad_up
    pad_amt[1] = pad_up
    pad_amt[3] = pad_down
else:
    pad_amt[1] = 0
    pad_amt[3] = 0
det_vid_preprocess = transforms.Compose(
    [transforms.Resize((target_h, target_w)), transforms.Pad(pad_amt, fill=127)]
)
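# Worked example of the resize-and-pad bookkeeping above (illustrative only,
# assuming a 1920x1080 source, det_imgsz = 1280 and det_stride = 32; the real
# stride comes from the loaded YOLOv7 weights):
#   landscape input  -> target_w, target_h = 1280, int(1280 * 1080 / 1920) = 1280, 720
#   1280 % 32 == 0   -> no left/right padding
#   720 % 32 == 16   -> pad height by 32 - 16 = 16 px, split 8 top / 8 bottom
#   pad_amt == [0, 8, 0, 8], i.e. (left, top, right, bottom) for transforms.Pad,
#   so det_model sees a 1280x736 frame letterboxed with mid-gray (fill=127).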
batch_size = 6
clip_interval = 10
array_score = list()
final_output = dict()
final_output["start_score_time"] = time.time()
final_output["num_frames"] = video_info.frame_count
st = time.time()
frame_numbers = list()
det_results = list()
clip_results = list()
clip_array = list()
for i in range(video_info.frame_count):
    success, frame_matrix = cap_handle.read()
    clip_array.append((i, frame_matrix))
    if not success:
        break
    array_score.append((i, frame_matrix))
    if len(array_score) >= batch_size:
        score_result = score_frames(array_score)
        det_results.extend(score_result["det"])
        clip_results.extend(score_result["clip"])
        frame_numbers.extend(score_result["fr#"])
        array_score = list()
    if not (i % clip_interval):
        print('do_clip')
if len(array_score) > 0:
    score_result = score_frames(array_score)
    det_results.extend(score_result["det"])
    clip_results.extend(score_result["clip"])
    frame_numbers.extend(score_result["fr#"])
cap_handle.release()
et = time.time()
final_output["end_score_time"] = time.time()
final_output["video"] = {
    "w": vid_w,
    "h": vid_h,
    "path": file_to_score,
    "target_w": target_w,
    "target_h": target_h,
    "pad_amt": pad_amt,
}
try:
    final_output['scoring_fps'] = final_output['num_frames'] / (final_output['end_score_time'] - final_output['start_score_time'])
except Exception as e:
    pass
final_output['scores'] = list()
for frame_number, frame in zip(frame_numbers, det_results):
    cframe_dict = dict()
    cframe_dict['frame'] = frame_number
    cframe_dict['score_number'] = frame_number
    cframe_dict['detections'] = list()
    for det in frame:
        data = dict()
        data['coords'] = [float(x) for x in list(det[0:4])]
        data['score'] = float(det[4])
        data['idx'] = int(det[5])
        try:
            data['name'] = det_labels[data['idx']]
        except:
            data['name'] = 'Code failed'
        cframe_dict['detections'].append(data)
    final_output['scores'].append(cframe_dict)
orin.png Normal file (binary, 288 KiB; not shown)

report_dynamo_export.sarif Normal file (95638 added lines)

File diff suppressed because one or more lines are too long

saved.png Normal file (binary, 259 KiB; not shown)

short.npz Executable file (binary; not shown)

View File

@@ -1,4 +1,5 @@
 import time
+from datetime import datetime
 import cv2
 import numpy
@@ -12,7 +13,7 @@ import torch
 from cuda import cuda as ccuda
 from cuda import cudart
-cmd = "filesrc location=/home/thebears/local/source/full.mp4 ! qtdemux name=demux demux.video_0 ! queue ! h265parse ! nvv4l2decoder ! nvvidconv ! videoscale method=1 add-borders=false ! video/x-raw,width=1280,height=1280 ! appsink sync=false"
+cmd = "filesrc location=/home/thebears/local/source/short.mp4 ! qtdemux name=demux demux.video_0 ! queue ! h265parse ! nvv4l2decoder ! nvvidconv ! videoscale method=1 add-borders=false ! video/x-raw,width=1280,height=1280 ! appsink sync=false"
 cap = cv2.VideoCapture(cmd, cv2.CAP_GSTREAMER)
@@ -21,6 +22,7 @@ fr = 0
 arrays_to_score = list()
+imgs = list()
 array = list()
 while True:
     good, frf = cap.read()
@@ -31,11 +33,11 @@ while True:
     array.append(frf)
+    imgs.append(frf)
     if len(array) > 8:
         arrays_to_score.append(torch.from_numpy(np.asarray(array)))
         array = list()
-        break
 if len(array) > 0:
@@ -45,55 +47,84 @@ if len(array) > 0:
 et = time.time()
 print(et - st, fr / (st - et))
+# %%
+from datetime import datetime
+pretrained_name = "webli"
+#model_name = "ViT-L-16-SigLIP2-512"
+model_name = 'ViT-SO400M-16-SigLIP2-512'
+rt_dir ='/home/thebears/local/source/models/'
+os.makedirs(rt_dir, exist_ok=True)
+fname = model_name.replace('-','_').lower() + '_'+datetime.now().strftime('%Y%m%d')
+ONNX_FILE_PATH=os.path.join(rt_dir, fname + '.onnx')
+ENGINE_FILE_PATH = os.path.splitext(ONNX_FILE_PATH)[0]+'.engine'
 # %%
-pretrained_name = "webli"
-model_name = "ViT-L-16-SigLIP-512"
-model_name = 'ViT-SO400M-16-SigLIP2-512'
-ONNX_FILE_PATH = "/home/thebears/local/source/so400m_siglip2_512.onnx"
-#model_name, pretrained_name = ('ViT-B-16-quickgelu', 'openai')
 model, _, preprocess = open_clip.create_model_and_transforms(
     model_name, pretrained=pretrained_name
 )
+# %%
+model_gpu = model.cuda()
+scores = list()
+all_means = list()
+with torch.no_grad():
+    for fr_num, img in enumerate(imgs):
+        tensor_raw = torch.tensor(img[None,:,:,0:3])
+        tensor_perm = tensor_raw.permute([0, 3, 1, 2]).to(torch.float32) / 255
+        tensor_reshaped = preprocess.transforms[0](tensor_perm)
+        tensor_mean = preprocess.transforms[-1](tensor_reshaped)
+        all_means.append(tensor_mean)
+        imp = model_gpu.encode_image(tensor_mean.cuda())
+        print(fr_num)
+        scores.append((fr_num, imp.detach().cpu().numpy()))
+# %%
+np.save('dump_so400m',np.concatenate([x[1] for x in scores]))
 # %%
 with torch.no_grad():
     et = time.time()
     if True:
-        tensor_raw = arrays_to_score[0][0,:,:,0:3][None,:,:,:]
+        tensor_raw = torch.concat(arrays_to_score)[0:4, :, :, 0:3]
         tensor_perm = tensor_raw.permute([0, 3, 1, 2]).to(torch.float32) / 255
         tensor_reshaped = preprocess.transforms[0](tensor_perm)
         tensor_mean = preprocess.transforms[-1](tensor_reshaped)
     else:
-        tensor_raw = torch.concat(arrays_to_score)[0:4, :, :, 0:3]
+        tensor_raw = torch.concat(arrays_to_score)[0, :, :, 0:3]
         tensor_perm = tensor_raw.permute([0, 3, 1, 2]).to(torch.float32) / 255
         tensor_reshaped = preprocess.transforms[1](preprocess.transforms[0](tensor_perm))
         tensor_mean = preprocess.transforms[-1](tensor_reshaped)
-    imp = model.encode_image(tensor_mean)
+    #imp = model.encode_image(tensor_mean)
+    imp = model_gpu.encode_image(tensor_mean.cuda())
     st = time.time()
     print((st - et) / tensor_raw.shape[0], tensor_raw.shape[0]/(st - et) )
-    from_model_on_gpu = imp.cpu().numpy()
+    from_model_on_gpu = imp.detach().cpu().numpy()
 # %%
+ENGINE_FILE_PATH = os.path.splitext(ONNX_FILE_PATH)[0]+'.trt'
 torch.onnx.export(
-    model.visual,
-    tensor_mean,
+    model.visual.cuda(),
+    tensor_mean.cuda(),
     ONNX_FILE_PATH,
     input_names=["input"],
     output_names=["output"],
 )
+# %%
 X_test = tensor_mean.cpu().numpy()
 sess = rt.InferenceSession(
     ONNX_FILE_PATH, providers=rt.get_available_providers())
@@ -106,7 +137,7 @@ def norm(v):
 print(np.dot(norm(pred_onx), norm(from_model_on_gpu).T))
+# %%
 TRT_LOGGER = trt.Logger()
 def build_engine_from_onnx(onnx_file_path, use_fp16=True):
     """
@@ -142,7 +173,7 @@ def build_engine_from_onnx(onnx_file_path, use_fp16=True):
     # Enable FP16 precision if requested and if the GPU supports it
     if use_fp16:
         if builder.platform_has_fast_fp16:
-            # config.set_flag(trt.BuilderFlag.FP16)
+            config.set_flag(trt.BuilderFlag.FP16)
             print("FP16 enabled successfully")
         else:
             print("Warning: GPU doesn't support fast FP16, using FP32 instead")
@@ -160,7 +191,7 @@ def build_engine_from_onnx(onnx_file_path, use_fp16=True):
-engine = build_engine_from_onnx(ONNX_FILE_PATH, use_fp16=False)
+engine = build_engine_from_onnx(ONNX_FILE_PATH, use_fp16=True)
 with open(ENGINE_FILE_PATH, "wb") as f:
     f.write(engine)