Files
SearchInterface/SearchScratch/test_seek.py
2025-09-17 12:03:14 -04:00

183 lines
5.3 KiB
Python

# %%
import sys, os
sys.path.append("/home/thebears/Web/Nuggets/SearchInterface/SearchUtil")
sys.path.append("/home/thebears/Web/Nuggets/SearchInterface/VectorService/util")
import embed_scores as ES
# %%
# Cell: score every video in one day's capture folder against a free-text
# query and report how many videos clear the similarity threshold.
query = 'Cat and human'
c_dir = '/srv/ftp_tcc/leopards1/2025/09/08'
threshold = 0.10
results = ES.calculate_embedding_score_in_folder(c_dir, threshold, query)
n_matching_videos = len(results['videos'])
print(n_matching_videos)
# %%
# Cell: manual walk-through of calculate_embedding_score_in_folder's internals.
# Resolves the capture dir against its possible mount roots, scores every frame
# embedding against the query vector, then groups matching frames into
# per-video time spans with their scores.
# FIX: the pasted original had all block indentation stripped (SyntaxError);
# structure reconstructed here.
c_dir = '/srv/ftp_tcc/leopards1/2025/09/08'
query_vector = None
og_dir = c_dir
if query_vector is None:
    query_vector = ES.get_query_vector(query)

# The same FTP tree can be mounted under several roots; use the first that exists.
candidate_dirs = [
    og_dir,
    og_dir.replace('/srv/ftp_tcc', '/mnt/hdd_24tb_1/videos/ftp'),
    og_dir.replace('/srv/ftp', '/mnt/hdd_24tb_1/videos/ftp'),
]
c_dir = None
for candidate in candidate_dirs:
    if os.path.exists(candidate):
        c_dir = candidate
        break
if c_dir is None:
    # return []   # (the real function bails out with no results here)
    pass

from embed_scores import *  # brings in md5, np, get_vector_representation, get_scores_embedding_c_dir, ...

redis_key = 'helllo'
# Cache key is the md5 of the raw query-vector bytes (numpy buffer protocol).
vec_cache_str = md5(query_vector).hexdigest()
cache_file_loc = os.path.join(c_dir, 'embedding_scores@'+str(threshold)+'@'+vec_cache_str+'.pkl')
vec_rep = get_vector_representation(c_dir, redis_key = redis_key)
query_scores = get_scores_embedding_c_dir(c_dir, tuple(query_vector.tolist()[0]))
video_json_info = list()
# Frames whose similarity beats the threshold, and the videos they belong to.
idces_keep = np.where(query_scores > threshold)[0]
video_id = vec_rep['idces'][idces_keep]
videos_that_match = np.unique(video_id)
id_extract_video_level = np.where(np.isin(vec_rep['idces'], videos_that_match))[0]
# Split wherever the video id changes -> one timestamp group per matching video.
idces_split = np.where(np.diff(vec_rep['idces'][id_extract_video_level]) !=0)[0] + 1
subset_timestampsF = np.split(vec_rep['timestamps'][id_extract_video_level], idces_split)
for idx, subset_t in enumerate(subset_timestampsF):
    if len(subset_t) == 0:
        continue
    min_t = min(subset_t)
    max_t = max(subset_t)
    print(idx, max_t - min_t)
    # All frames (from any video) that fall inside this match's time window.
    idces_curr = np.where(np.logical_and(vec_rep['timestamps'] > min_t , vec_rep['timestamps'] < max_t))[0]
    if len(idces_curr) == 0:
        continue
    unq_vids = np.unique(vec_rep['idces'][idces_curr])
    subset_idx = np.where(np.isin(vec_rep['idces'],unq_vids))[0]
    subset_idces = vec_rep['idces'][subset_idx]
    subset_timestamps = vec_rep['timestamps'][subset_idx]
    subset_scores = query_scores[subset_idx]
    # Re-split the window's frames by video id so each output record is one file.
    idx_split = np.where(np.diff(vec_rep['idces'][subset_idx]) !=0)[0]+1
    split_idces = np.split(subset_idces, idx_split)
    split_timestamps = np.split(subset_timestamps, idx_split)
    split_scores = np.split(subset_scores, idx_split)
    split_files = [vec_rep['source_files'][x[0]] for x in split_idces]
    for s_file, s_scores, s_tstamps, s_idces in zip(split_files, split_scores, split_timestamps, split_idces):
        start_time = float(min(s_tstamps))
        end_time = float(max(s_tstamps))
        # Times are stored relative to the clip start for the front-end plot.
        frame_time = (s_tstamps - start_time).tolist()
        embed_scores = s_scores.tolist()
        c_data = {'file_name': str(s_file), 'start_time':start_time, 'end_time':end_time, 'embed_scores':{'time':frame_time, 'score':embed_scores}}
        video_json_info.append(c_data)
print(len(video_json_info))
# %%
# Cell: run the multi-folder scorer across several capture days, then
# annotate the result with the gaps between consecutive videos.
query = 'A cat and a human'
c_dirs = [
    '/mnt/hdd_24tb_1/videos/ftp/leopards2/2025/08/26',
    '/srv/ftp_tcc/leopards1/2025/08/27',
    '/srv/ftp_tcc/leopards1/2025/08/28',
    '/srv/ftp_tcc/leopards1/2025/08/29',
]
threshold = 0.10
folder_scores = ES.calculate_embedding_score_in_folders(tuple(c_dirs), threshold=threshold, query=query)
folder_scores['breaks'] = ES.add_breaks_between_videos(folder_scores)
# %%
# Cell: linear scan for the video whose [start_time, end_time] window contains
# a target timestamp, then derive the file name and playback offset into it.
target_tstamp = 1756332686.5805347
matching_file = None
for video_file in folder_scores['videos']:
    start_time = video_file['start_time']
    end_time = video_file['end_time']
    if start_time < target_tstamp < end_time:
        matching_file = video_file
if matching_file is not None:
    # BUG FIX: original read fname/start_time from the loop-leaked variables,
    # which refer to the *last* video scanned, not the matched one.
    fname = matching_file['file_name']
    offset = target_tstamp - matching_file['start_time']
else:  # FIX: original had a stray "pelse:" paste artifact here
    fname = 'None Found'
    offset = -1
web_name = os.path.basename(fname)
# %%
# Cell: same timestamp lookup via the library helper, probing 500 s past
# the target to exercise the not-inside-any-video path.
import embed_scores as ES

probe_tstamp = target_tstamp + 500
result = ES.get_matching_file_for_tstamp(probe_tstamp, folder_scores)
print(result)
# %%
# Cell: fetch precomputed scores from the video service over HTTP.
import requests
folder_scores = requests.get('http://192.168.1.242:5004/videos.json').json()
# FIX: original line was truncated mid-expression ("print(len(");
# restored to the same video-count report used by the other cells.
print(len(folder_scores['videos']))
# %%
folder_scores = requests.get('http://192.168.1.242:5004/videos.json', params={'threshold':0.09}).json()
print(len(folder_scores['videos']))
# %%
# Cell: downsample each video's per-frame score curve with LTTB so a front-end
# plot stays light: keep at least `min_rows` points, otherwise 10% of the rows.
# FIX: the pasted original had the loop body's indentation stripped; restored.
new_folder_scores = folder_scores.copy()
import lttb
min_rows = 15
factor = 0.1
for x in new_folder_scores['videos']:
    # Rows are (time, score); lttb expects shape (n_points, 2), hence the .T.
    data = np.asarray([x['embed_scores']['time'], x['embed_scores']['score']])
    amt = max(min_rows, int(factor * data.shape[1]))
    if data.shape[1] > amt:
        sampled = lttb.downsample(data.T, amt)
    else:
        sampled = data.T
    time = sampled[:, 0].tolist()
    scores = sampled[:, 1].tolist()
    # NOTE(review): time/scores are recomputed each iteration but never written
    # back into x — presumably scratch output; confirm before relying on it.
# %%
# Cell: point at one cached score pickle and derive the folder it lives in.
import pickle

cache_file_loc = '/srv/ftp_tcc/leopards1/2025/09/09/embedding_scores@0.1@de376b3b6e90315477571ef6e82e841c.pkl'
c_dir, _cache_basename = os.path.split(cache_file_loc)
# %%
# Cell: sanity-check a cached result against the folder contents — compare
# which videos the cache covers vs. which embedding files exist on disk.
# FIX: the pasted original had the `with` body's indentation stripped
# (SyntaxError); restored.
with open(cache_file_loc,'rb') as f:
    video_json_info = pickle.load(f)
# Video basenames (no extension) recorded in the cached result.
files_in_cache = {os.path.splitext(os.path.basename(x['file_name']))[0] for x in video_json_info}
lsd_dir = os.listdir(c_dir)
# Video basenames that have a per-frame embedding archive on disk.
files_on_disk = {x.split('.')[0] for x in lsd_dir if x.endswith('oclip_embeds.npz')}
print(len(files_on_disk), len(files_in_cache))
# %%
# Cell: pick one movie file to experiment with.
# FIX: original cell marker had a stray leading "p" paste artifact, which
# would raise NameError at runtime; removed.
import embed_scores as ES
a_mov = '/srv/ftp_tcc/leopards1/2025/09/09/Leopards1_00_20250909045221.mp4'