common code

commit 50376f71a8
parent 090af0f477
2025-09-17 12:03:14 -04:00
22 changed files with 3145 additions and 390 deletions

162
VectorService/util/.gitignore vendored Normal file
View File

@@ -0,0 +1,162 @@
### Python
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class
# C extensions
*.so
# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST
# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec
# Installer logs
pip-log.txt
pip-delete-this-directory.txt
# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
cover/
# Translations
*.mo
*.pot
# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal
# Flask stuff:
instance/
.webassets-cache
# Scrapy stuff:
.scrapy
# Sphinx documentation
docs/_build/
# PyBuilder
.pybuilder/
target/
# Jupyter Notebook
.ipynb_checkpoints
# IPython
profile_default/
ipython_config.py
# pyenv
# For a library or package, you might want to ignore these files since the code is
# intended to run in multiple environments; otherwise, check them in:
# .python-version
# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock
# poetry
# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
# This is especially recommended for binary packages to ensure reproducibility, and is more
# commonly ignored for libraries.
# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
#poetry.lock
# pdm
# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
#pdm.lock
# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
# in version control.
# https://pdm.fming.dev/#use-with-ide
.pdm.toml
# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
__pypackages__/
# Celery stuff
celerybeat-schedule
celerybeat.pid
# SageMath parsed files
*.sage.py
# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/
# Spyder project settings
.spyderproject
.spyproject
# Rope project settings
.ropeproject
# mkdocs documentation
/site
# mypy
.mypy_cache/
.dmypy.json
dmypy.json
# Pyre type checker
.pyre/
# pytype static type analyzer
.pytype/
# Cython debug symbols
cython_debug/
# PyCharm
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
# and can be added to the global gitignore or merged into this file. For a more nuclear
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
#.idea/

View File

@@ -0,0 +1,11 @@
{
    "folders": [
        {
            "path": "../../../../../Seafile/Designs/Code/Python/CommonCode"
        },
        {
            "path": "../.."
        }
    ],
    "settings": {}
}

339
VectorService/util/embed_scores.py Normal file
View File

@@ -0,0 +1,339 @@
from CommonCode.video_meta import FTPVideo
from CommonCode.settings import get_logger
import logging
import json
import datetime as dt
import functools
import requests
import numpy as np
from pqdm.processes import pqdm
from multiprocessing import Pool
import os
import lttb
import pickle
import redis
from hashlib import md5

r = redis.Redis(host='localhost', port=6379, db=15)
logger = get_logger(__name__, '/var/log/vector_search_logs/util_embed_scores', stdout=True, systemd=False, level=logging.INFO)

def get_matching_file_for_tstamp(target_tstamp, folder_scores):
    matching_file = None
    for video_file in folder_scores['videos']:
        start_time = video_file['start_time']
        end_time = video_file['end_time']
        if start_time < target_tstamp < end_time:
            matching_file = video_file
            break  # video ranges do not overlap, so the first hit is the match
    if matching_file is not None:
        fname = matching_file['file_name']
        offset = target_tstamp - matching_file['start_time']
    else:
        fname = 'None Found'
        offset = -1
    web_name = 'media/' + os.path.basename(fname)
    return dict(full_path=fname, path=web_name, timeoffset=offset)
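
# Illustrative example (hypothetical values): for a hit 180 s into a file that
# spans [start_time, start_time + 300], this returns something like
#   {'full_path': '/srv/ftp_tcc/leopards1/2025/09/08/clip.mp4',
#    'path': 'media/clip.mp4', 'timeoffset': 180.0}
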
def get_vec_rep_file_loc(c_dir):
    return os.path.join(c_dir, 'vec_rep.npz')

def get_vector_representation(c_dir, force_compute=False, redis_key='compute_log'):
    message = {'task': 'VECTOR_CALC_IN_FOLDER_START', 'when': str(c_dir), 'time': dt.datetime.now().timestamp()}
    r.rpush(redis_key, json.dumps(message))
    vec_rep_file = get_vec_rep_file_loc(c_dir)
    if os.path.exists(vec_rep_file) and not force_compute:
        try:
            result = dict(np.load(vec_rep_file))
            message = {'task': 'VECTOR_CALC_IN_FOLDER_DONE', 'when': str(c_dir), 'time': dt.datetime.now().timestamp(), 'precomputed': True}
            r.rpush(redis_key, json.dumps(message))
            return result
        except Exception:
            # Corrupt cache; delete it and fall through to recompute.
            os.remove(vec_rep_file)
    ff = list()
    for root, dirs, files in os.walk(c_dir):
        for f in files:
            if f.endswith('.mp4') and '_reduced' not in f:
                ff.append(os.path.join(root, f))
    sorted_videos = sorted(FTPVideo(x) for x in ff)
    all_cat = list()
    all_idx = list()
    all_source = list()
    all_tstamps = list()
    enu = 0
    for idx, vid in enumerate(sorted_videos):
        try:
            hh = vid.embeddings
        except Exception:
            hh = None
        if hh is not None:
            n_emb = FTPVideo.vec_norm(hh['embeds'])
            all_cat.append(n_emb)
            all_idx.append(enu * np.ones(n_emb.shape[0], dtype=np.int64))
            all_source.append(vid.real_path)
            all_tstamps.append([t.timestamp() for t in hh['frame_time']])
            enu += 1
        message = {'task': 'VECTOR_CALC_IN_FOLDER_BUMP', 'progress': idx + 1, 'how_many': len(sorted_videos), 'time': dt.datetime.now().timestamp()}
        r.rpush(redis_key, json.dumps(message))
    if len(all_cat) == 0:
        # Nothing with embeddings in this folder; callers treat None as "no data".
        return None
    all_embeds = FTPVideo.vec_norm(np.vstack(all_cat))
    all_idces = np.hstack(all_idx)
    all_times = np.hstack(all_tstamps)
    np.savez(vec_rep_file, embeds=all_embeds, idces=all_idces, timestamps=all_times, source_files=all_source)
    message = {'task': 'VECTOR_CALC_IN_FOLDER_DONE', 'when': str(c_dir), 'time': dt.datetime.now().timestamp()}
    r.rpush(redis_key, json.dumps(message))
    return dict(embeds=all_embeds, idces=all_idces, timestamps=all_times, source_files=all_source)

def get_scores_embedding_c_dir(c_dir, query_vector, redis_key='compute_log'):
    vec_rep = get_vector_representation(c_dir, redis_key=redis_key)
    if vec_rep is None:
        return np.zeros(0)
    return (np.asarray(query_vector) @ vec_rep['embeds'].T).squeeze()

@functools.lru_cache
def get_query_vector(query):
    vec_form = requests.get('http://192.168.1.242:53004/encode', params={'query': query}).json()['vector'][0]
    vec_search = np.asarray(vec_form)
    return FTPVideo.vec_norm(vec_search[None, :])
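
# Both the query vector and the stored frame embeddings pass through
# FTPVideo.vec_norm, so the dot products used below are cosine similarities in
# [-1, 1]. Illustrative shapes (names here are placeholders):
#   q = get_query_vector('A cat and a human')   # (1, D)
#   scores = (q @ embeds.T).squeeze()           # (N,) for embeds of shape (N, D)
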
def calculate_embedding_score_in_folders(c_dirs, threshold, query=None, query_vector=None, redis_key='compute_log'):
    result_list = list()
    # Only encode the query if the caller did not already supply a vector.
    if query_vector is None:
        query_vector = get_query_vector(query)
    args = [(x, threshold, query, query_vector, logger, redis_key) for x in c_dirs]
    logger.info(f"CALCULATING FOR {c_dirs}")
    with Pool(processes=8) as pool:
        out = pool.starmap(calculate_embedding_score_in_folder, args)
    logger.info(f"DONE CALCULATING FOR {c_dirs}")
    for x in out:
        try:
            result_list.extend(x['videos'])
        except Exception as e:
            logger.info(f"SKIPPING MALFORMED FOLDER RESULT {x}: {e}")
    return {'videos': result_list}

def collapse_scores_to_maxmin_avg(folder_scores):
    result = list()
    for c_data in folder_scores['videos']:
        new_d = c_data.copy()
        # .copy() is shallow: clone the nested dict too, so the version cached
        # in the session is not mutated in place.
        new_d['embed_scores'] = c_data['embed_scores'].copy()
        scores = new_d['embed_scores']['score']
        times = new_d['embed_scores']['time']
        max_score = max(scores)
        min_score = min(scores)
        max_score_time = times[scores.index(max_score)]
        min_score_time = times[scores.index(min_score)]
        new_d['embed_scores']['score'] = [min_score, max_score, max_score_time, min_score_time]
        new_d['embed_scores']['time'] = max(times)
        result.append(new_d)
    return result
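
# Illustrative output per video (hypothetical numbers): 'score' is collapsed to
# [min_score, max_score, max_score_time, min_score_time], e.g.
# [0.08, 0.31, 12.5, 440.0], and 'time' becomes the last frame offset.
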
def calculate_embedding_score_in_folder(og_dir, threshold, query=None, query_vector=None, logger=logger, redis_key='compute_log'):
    message = {'task': 'SCORE_CALC_IN_FOLDER_START', 'when': str(og_dir), 'time': dt.datetime.now().timestamp()}
    r.rpush(redis_key, json.dumps(message))
    if query_vector is None:
        query_vector = get_query_vector(query)
    # The folder may live on the FTP mount or on the archive disk; probe both.
    candidate_dirs = [
        og_dir,
        og_dir.replace('/srv/ftp_tcc', '/mnt/hdd_24tb_1/videos/ftp'),
        og_dir.replace('/srv/ftp', '/mnt/hdd_24tb_1/videos/ftp'),
    ]
    c_dir = next((c for c in candidate_dirs if os.path.exists(c)), None)
    if c_dir is None:
        return {'source_files': [], 'videos': []}
    vec_cache_str = md5(np.ascontiguousarray(query_vector).tobytes()).hexdigest()
    cache_file_loc = os.path.join(c_dir, 'embedding_scores@' + str(threshold) + '@' + vec_cache_str + '.pkl')
    if os.path.exists(cache_file_loc):
        logger.info(f"TRYING TO LOAD CACHE {cache_file_loc}")
        try:
            with open(cache_file_loc, 'rb') as f:
                video_json_info = pickle.load(f)
            files_in_cache = {os.path.splitext(os.path.basename(x))[0] for x in video_json_info.get('source_files', [])}
            files_on_disk = {x.split(".")[0] for x in os.listdir(c_dir) if x.endswith('oclip_embeds.npz')}
            if files_on_disk == files_in_cache:
                logger.info(f"LOADED EMBEDDING SCORE FROM CACHE {cache_file_loc}")
                message = {'task': 'SCORE_CALC_IN_FOLDER_DONE', 'when': str(c_dir), 'time': dt.datetime.now().timestamp(), 'precomputed': True}
                r.rpush(redis_key, json.dumps(message))
                return video_json_info
            else:
                logger.info(f"CACHE FILE IS OLD, DELETING VEC REP FILE AND RECREATING {cache_file_loc}")
                os.remove(get_vec_rep_file_loc(c_dir))
        except Exception as e:
            logger.info(f"CACHE FILE IS CORRUPT, RECREATING {cache_file_loc} {e}")
            os.remove(cache_file_loc)
    vec_rep = get_vector_representation(c_dir, redis_key=redis_key)
    if vec_rep is None:
        return {'source_files': [], 'videos': []}
    query_scores = (query_vector @ vec_rep['embeds'].T).squeeze()
    video_json_info = list()
    idces_keep = np.where(query_scores > threshold)[0]
    videos_that_match = np.unique(vec_rep['idces'][idces_keep])
    # Collect the timestamps of every matching video, split per video id.
    id_extract_video_level = np.where(np.isin(vec_rep['idces'], videos_that_match))[0]
    idces_split = np.where(np.diff(vec_rep['idces'][id_extract_video_level]) != 0)[0] + 1
    subset_timestampsF = np.split(vec_rep['timestamps'][id_extract_video_level], idces_split)
    for subset_t in subset_timestampsF:
        if len(subset_t) == 0:
            continue
        min_t = min(subset_t)
        max_t = max(subset_t)
        idces_curr = np.where(np.logical_and(vec_rep['timestamps'] > min_t, vec_rep['timestamps'] < max_t))[0]
        if len(idces_curr) == 0:
            continue
        unq_vids = np.unique(vec_rep['idces'][idces_curr])
        subset_idx = np.where(np.isin(vec_rep['idces'], unq_vids))[0]
        subset_idces = vec_rep['idces'][subset_idx]
        subset_timestamps = vec_rep['timestamps'][subset_idx]
        subset_scores = query_scores[subset_idx]
        idx_split = np.where(np.diff(subset_idces) != 0)[0] + 1
        split_idces = np.split(subset_idces, idx_split)
        split_timestamps = np.split(subset_timestamps, idx_split)
        split_scores = np.split(subset_scores, idx_split)
        split_files = [vec_rep['source_files'][x[0]] for x in split_idces]
        for s_file, s_scores, s_tstamps, s_idces in zip(split_files, split_scores, split_timestamps, split_idces):
            start_time = float(min(s_tstamps))
            end_time = float(max(s_tstamps))
            frame_time = (s_tstamps - start_time).tolist()
            embed_scores = s_scores.tolist()
            c_data = {'file_name': str(s_file), 'start_time': start_time, 'end_time': end_time, 'embed_scores': {'time': frame_time, 'score': embed_scores}}
            video_json_info.append(c_data)
    message = {'task': 'SCORE_CALC_IN_FOLDER_DONE', 'when': str(c_dir), 'time': dt.datetime.now().timestamp()}
    r.rpush(redis_key, json.dumps(message))
    to_write = {'source_files': vec_rep['source_files'], 'videos': video_json_info}
    with open(cache_file_loc, 'wb') as f:
        logger.info(f"WRITING EMBEDDING SCORE TO CACHE {cache_file_loc}")
        pickle.dump(to_write, f)
    return to_write
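
# Cache files are keyed by the threshold and the md5 of the query vector, e.g.
# (illustrative name) embedding_scores@0.1@9b6a7efcc7...pkl, so a different
# query or threshold never collides with earlier cached results in a folder.
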
def get_matching_file_given_filename(web_name, folder_scores):
    file_name = None
    for x in folder_scores['videos']:
        if x['file_name'].endswith(web_name):
            file_name = x['file_name']
            break
    if file_name is None:
        return None
    candidate_files = [
        file_name,
        file_name.replace('/srv/ftp_tcc', '/mnt/hdd_24tb_1/videos/ftp'),
        file_name.replace('/srv/ftp', '/mnt/hdd_24tb_1/videos/ftp'),
    ]
    return next((c for c in candidate_files if os.path.exists(c)), None)

# Example:
# c_dirs = ['/mnt/hdd_24tb_1/videos/ftp/leopards2/2025/08/26', '/srv/ftp_tcc/leopards1/2025/08/27', '/srv/ftp_tcc/leopards1/2025/08/28', '/srv/ftp_tcc/leopards1/2025/08/29']
# op = calculate_embedding_score_in_folders(tuple(c_dirs), 0.10, query='A cat and human')
def add_breaks_between_videos(op, threshold_to_split_seconds=30 * 60):  # 30 minutes
    ranges = [(vids['start_time'], vids['end_time']) for vids in op['videos']]
    breaks = list()
    for idx in range(len(ranges) - 1):
        end_now = ranges[idx][1]
        start_next = ranges[idx + 1][0]
        if (start_next - end_now) > threshold_to_split_seconds:
            breaks.append((end_now, start_next))
    return breaks
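
# A minimal end-to-end sketch (illustrative only; folder, threshold, and query
# are placeholders). This mirrors what the FastAPI endpoint does with these
# helpers: score folders, mark long gaps as breaks, collapse per-frame scores.
#   op = calculate_embedding_score_in_folders(('/srv/ftp_tcc/leopards1/2025/09/08',), 0.10, query='A cat and a human')
#   op['breaks'] = add_breaks_between_videos(op)
#   op['videos'] = collapse_scores_to_maxmin_avg(op)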

View File

@@ -0,0 +1,96 @@
from typing import Union, Optional, List
from pydantic import BaseModel
from fastapi import FastAPI, Request, Depends
from CommonCode.settings import get_logger
import logging
from fastapi.responses import StreamingResponse
import os
import sys
import json
import time
from util import embed_scores as ES
from fastapi_server_session import SessionManager, RedisSessionInterface, Session
import redis
from datetime import timedelta

app = FastAPI()
session_manager = SessionManager(
    interface=RedisSessionInterface(redis.from_url("redis://localhost"))
)
logger = get_logger(__name__, '/var/log/vector_search_logs/main_embed_scores', stdout=True, systemd=False, level=logging.INFO)
r = redis.Redis(host='localhost', port=6379, db=15)

class VideosPostRequest(BaseModel):
    # NB: no trailing commas here -- a trailing comma would turn each default
    # into a one-element tuple and break validation.
    query: str = "A cat and a human"
    threshold: float = 0.10
    c_dirs: Optional[List[str]] = None
    task_id: str = 'compute_log'

@app.post("/videos.json")
async def videos_json(
    vpr: VideosPostRequest,
    session: Session = Depends(session_manager.use_session),
):
    query = vpr.query
    threshold = vpr.threshold
    c_dirs = vpr.c_dirs
    task_id = vpr.task_id
    if c_dirs is None:
        c_dirs = [
            # "/mnt/hdd_24tb_1/videos/ftp/leopards2/2025/08/26",
            # "/srv/ftp_tcc/leopards1/2025/08/27",
            # "/srv/ftp_tcc/leopards1/2025/08/28",
            # "/srv/ftp_tcc/leopards1/2025/08/29",
            # "/srv/ftp_tcc/leopards1/2025/08/30",
            # "/srv/ftp_tcc/leopards1/2025/08/31",
            # "/srv/ftp_tcc/leopards1/2025/09/01",
            # "/srv/ftp_tcc/leopards1/2025/09/02",
            # "/srv/ftp_tcc/leopards1/2025/09/03",
            # "/srv/ftp_tcc/leopards1/2025/09/04",
            # "/srv/ftp_tcc/leopards1/2025/09/05",
            # "/srv/ftp_tcc/leopards1/2025/09/06",
            # "/srv/ftp_tcc/leopards1/2025/09/07",
            "/srv/ftp_tcc/leopards1/2025/09/08",
            "/srv/ftp_tcc/leopards1/2025/09/09",
            "/srv/ftp_tcc/leopards1/2025/09/10",
            "/srv/ftp_tcc/leopards1/2025/09/11",
        ]
    logger.info(','.join(str(x) for x in c_dirs))
    message = {'task': 'SCHEDULED', 'when': [str(x) for x in c_dirs], 'time': time.time()}
    r.rpush(task_id, json.dumps(message))
    for x in c_dirs:
        message = {'task': 'QUEUEING', 'when': str(x), 'time': time.time()}
        r.rpush(task_id, json.dumps(message))
    folder_scores = ES.calculate_embedding_score_in_folders(
        tuple(c_dirs), threshold=threshold, query=query, redis_key=task_id)
folder_scores["breaks"] = ES.add_breaks_between_videos(folder_scores)
folder_scores['videos'] = ES.collapse_scores_to_maxmin_avg(folder_scores)
session["folder_scores"] = folder_scores
return folder_scores
class ClickEvent(BaseModel):
    timestamp: float


class ClickResponse(BaseModel):
    path: str
    timeoffset: float
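
# A minimal client sketch (illustrative; the host and port are placeholders and
# depend on how uvicorn serves this app):
#   import requests
#   resp = requests.post('http://localhost:8000/videos.json',
#                        json={'query': 'A cat and a human', 'threshold': 0.10,
#                              'task_id': 'compute_log'})
#   folder_scores = resp.json()   # {'videos': [...], 'breaks': [...]}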