Files
SearchInterface/SearchScratch/test_seek.py
2025-09-17 12:03:14 -04:00

183 lines
5.3 KiB
Python

# %%
import sys, os
sys.path.append("/home/thebears/Web/Nuggets/SearchInterface/SearchUtil")
sys.path.append("/home/thebears/Web/Nuggets/SearchInterface/VectorService/util")
import embed_scores as ES
# %%
# Cell: score every video in one day's capture folder against a free-text
# query and report how many videos clear the similarity threshold.
query = 'Cat and human'
c_dir = '/srv/ftp_tcc/leopards1/2025/09/08'
threshold = 0.10
results = ES.calculate_embedding_score_in_folder(c_dir, threshold, query)
n_matching_videos = len(results['videos'])
print(n_matching_videos)
# %%
# Cell: manual walk-through of calculate_embedding_score_in_folder's internals.
# Resolves the capture dir against its possible mount roots, scores every frame
# embedding against the query vector, then groups matching frames into
# per-video time spans with their scores.
# FIX: the pasted original had all block indentation stripped (SyntaxError);
# structure reconstructed here.
c_dir = '/srv/ftp_tcc/leopards1/2025/09/08'
query_vector = None
og_dir = c_dir
if query_vector is None:
    query_vector = ES.get_query_vector(query)

# The same FTP tree can be mounted under several roots; use the first that exists.
candidate_dirs = [
    og_dir,
    og_dir.replace('/srv/ftp_tcc', '/mnt/hdd_24tb_1/videos/ftp'),
    og_dir.replace('/srv/ftp', '/mnt/hdd_24tb_1/videos/ftp'),
]
c_dir = None
for candidate in candidate_dirs:
    if os.path.exists(candidate):
        c_dir = candidate
        break
if c_dir is None:
    # return []   # (the real function bails out with no results here)
    pass

from embed_scores import *  # brings in md5, np, get_vector_representation, get_scores_embedding_c_dir, ...

redis_key = 'helllo'
# Cache key is the md5 of the raw query-vector bytes (numpy buffer protocol).
vec_cache_str = md5(query_vector).hexdigest()
cache_file_loc = os.path.join(c_dir, 'embedding_scores@'+str(threshold)+'@'+vec_cache_str+'.pkl')
vec_rep = get_vector_representation(c_dir, redis_key = redis_key)
query_scores = get_scores_embedding_c_dir(c_dir, tuple(query_vector.tolist()[0]))
video_json_info = list()
# Frames whose similarity beats the threshold, and the videos they belong to.
idces_keep = np.where(query_scores > threshold)[0]
video_id = vec_rep['idces'][idces_keep]
videos_that_match = np.unique(video_id)
id_extract_video_level = np.where(np.isin(vec_rep['idces'], videos_that_match))[0]
# Split wherever the video id changes -> one timestamp group per matching video.
idces_split = np.where(np.diff(vec_rep['idces'][id_extract_video_level]) !=0)[0] + 1
subset_timestampsF = np.split(vec_rep['timestamps'][id_extract_video_level], idces_split)
for idx, subset_t in enumerate(subset_timestampsF):
    if len(subset_t) == 0:
        continue
    min_t = min(subset_t)
    max_t = max(subset_t)
    print(idx, max_t - min_t)
    # All frames (from any video) that fall inside this match's time window.
    idces_curr = np.where(np.logical_and(vec_rep['timestamps'] > min_t , vec_rep['timestamps'] < max_t))[0]
    if len(idces_curr) == 0:
        continue
    unq_vids = np.unique(vec_rep['idces'][idces_curr])
    subset_idx = np.where(np.isin(vec_rep['idces'],unq_vids))[0]
    subset_idces = vec_rep['idces'][subset_idx]
    subset_timestamps = vec_rep['timestamps'][subset_idx]
    subset_scores = query_scores[subset_idx]
    # Re-split the window's frames by video id so each output record is one file.
    idx_split = np.where(np.diff(vec_rep['idces'][subset_idx]) !=0)[0]+1
    split_idces = np.split(subset_idces, idx_split)
    split_timestamps = np.split(subset_timestamps, idx_split)
    split_scores = np.split(subset_scores, idx_split)
    split_files = [vec_rep['source_files'][x[0]] for x in split_idces]
    for s_file, s_scores, s_tstamps, s_idces in zip(split_files, split_scores, split_timestamps, split_idces):
        start_time = float(min(s_tstamps))
        end_time = float(max(s_tstamps))
        # Times are stored relative to the clip start for the front-end plot.
        frame_time = (s_tstamps - start_time).tolist()
        embed_scores = s_scores.tolist()
        c_data = {'file_name': str(s_file), 'start_time':start_time, 'end_time':end_time, 'embed_scores':{'time':frame_time, 'score':embed_scores}}
        video_json_info.append(c_data)
print(len(video_json_info))
# %%
# Cell: run the multi-folder scorer across several capture days, then
# annotate the result with the gaps between consecutive videos.
query = 'A cat and a human'
c_dirs = [
    '/mnt/hdd_24tb_1/videos/ftp/leopards2/2025/08/26',
    '/srv/ftp_tcc/leopards1/2025/08/27',
    '/srv/ftp_tcc/leopards1/2025/08/28',
    '/srv/ftp_tcc/leopards1/2025/08/29',
]
threshold = 0.10
folder_scores = ES.calculate_embedding_score_in_folders(tuple(c_dirs), threshold=threshold, query=query)
folder_scores['breaks'] = ES.add_breaks_between_videos(folder_scores)
# %%
# Cell: linear scan for the video whose [start_time, end_time] window contains
# a target timestamp, then derive the file name and playback offset into it.
target_tstamp = 1756332686.5805347
matching_file = None
for video_file in folder_scores['videos']:
    start_time = video_file['start_time']
    end_time = video_file['end_time']
    if start_time < target_tstamp < end_time:
        matching_file = video_file
if matching_file is not None:
    # BUG FIX: original read fname/start_time from the loop-leaked variables,
    # which refer to the *last* video scanned, not the matched one.
    fname = matching_file['file_name']
    offset = target_tstamp - matching_file['start_time']
else:  # FIX: original had a stray "pelse:" paste artifact here
    fname = 'None Found'
    offset = -1
web_name = os.path.basename(fname)
# %%
# Cell: same timestamp lookup via the library helper, probing 500 s past
# the target to exercise the not-inside-any-video path.
import embed_scores as ES

probe_tstamp = target_tstamp + 500
result = ES.get_matching_file_for_tstamp(probe_tstamp, folder_scores)
print(result)
# %%
# Cell: fetch precomputed scores from the video service over HTTP.
import requests
folder_scores = requests.get('http://192.168.1.242:5004/videos.json').json()
# FIX: original line was truncated mid-expression ("print(len(");
# restored to the same video-count report used by the other cells.
print(len(folder_scores['videos']))
# %%
folder_scores = requests.get('http://192.168.1.242:5004/videos.json', params={'threshold':0.09}).json()
print(len(folder_scores['videos']))
# %%
# Cell: downsample each video's per-frame score curve with LTTB so a front-end
# plot stays light: keep at least `min_rows` points, otherwise 10% of the rows.
# FIX: the pasted original had the loop body's indentation stripped; restored.
new_folder_scores = folder_scores.copy()
import lttb
min_rows = 15
factor = 0.1
for x in new_folder_scores['videos']:
    # Rows are (time, score); lttb expects shape (n_points, 2), hence the .T.
    data = np.asarray([x['embed_scores']['time'], x['embed_scores']['score']])
    amt = max(min_rows, int(factor * data.shape[1]))
    if data.shape[1] > amt:
        sampled = lttb.downsample(data.T, amt)
    else:
        sampled = data.T
    time = sampled[:, 0].tolist()
    scores = sampled[:, 1].tolist()
    # NOTE(review): time/scores are recomputed each iteration but never written
    # back into x — presumably scratch output; confirm before relying on it.
# %%
# Cell: point at one cached score pickle and derive the folder it lives in.
import pickle

cache_file_loc = '/srv/ftp_tcc/leopards1/2025/09/09/embedding_scores@0.1@de376b3b6e90315477571ef6e82e841c.pkl'
c_dir, _cache_basename = os.path.split(cache_file_loc)
# %%
# Cell: sanity-check a cached result against the folder contents — compare
# which videos the cache covers vs. which embedding files exist on disk.
# FIX: the pasted original had the `with` body's indentation stripped
# (SyntaxError); restored.
with open(cache_file_loc,'rb') as f:
    video_json_info = pickle.load(f)
# Video basenames (no extension) recorded in the cached result.
files_in_cache = {os.path.splitext(os.path.basename(x['file_name']))[0] for x in video_json_info}
lsd_dir = os.listdir(c_dir)
# Video basenames that have a per-frame embedding archive on disk.
files_on_disk = {x.split('.')[0] for x in lsd_dir if x.endswith('oclip_embeds.npz')}
print(len(files_on_disk), len(files_in_cache))
# %%
# Cell: pick one movie file to experiment with.
# FIX: original cell marker had a stray leading "p" paste artifact, which
# would raise NameError at runtime; removed.
import embed_scores as ES
a_mov = '/srv/ftp_tcc/leopards1/2025/09/09/Leopards1_00_20250909045221.mp4'