Files
vector_search/milvus_migrate/upload_from_folder.py
2025-04-17 15:55:54 -04:00

76 lines
1.9 KiB
Python

import os
import time

import numpy as np
from pymilvus import MilvusClient, DataType
from pymilvus.client.types import LoadState
# Module-level Milvus client; upload_vector_file() inserts through it.
client = MilvusClient(uri="http://localhost:19530")

# Make sure the target collection is loaded before any insert is attempted.
res = client.get_load_state(collection_name="nuggets_so400m")
if res['state'] != LoadState.Loaded:
    client.load_collection(collection_name='nuggets_so400m')
    # Poll up to 10 times, about one second apart. The state must be
    # re-queried on every pass: testing the stale `res` from before the load
    # request could never observe the collection becoming loaded.
    for _ in range(10):
        res = client.get_load_state(collection_name="nuggets_so400m")
        if res['state'] == LoadState.Loaded:
            break
        time.sleep(1)
    else:
        # Best effort, as before: keep going, but say so instead of failing silently.
        print('Warning: collection nuggets_so400m not reported as loaded after waiting')
def get_vec_path(vpath):
    """Return the embeddings-archive path derived from ``vpath``.

    The last extension of ``vpath`` (if any) is dropped and
    ``.oclip_embeds.npz`` is appended in its place.
    """
    stem, _extension = os.path.splitext(vpath)
    archive_suffix = '.oclip_embeds.npz'
    return stem + archive_suffix
def get_db_embed_done_path(vpath):
    """Return the marker-file path derived from ``vpath``.

    The last extension of ``vpath`` (if any) is dropped and
    ``.db_has_oclip_embeds`` is appended in its place.
    """
    stem, _extension = os.path.splitext(vpath)
    marker_suffix = '.db_has_oclip_embeds'
    return stem + marker_suffix
def upload_vector_file(vector_file_to_upload):
    """Insert one file's embeddings into the ``nuggets_so400m`` collection.

    The argument may be the ``oclip_embeds.npz`` archive itself or the path of
    the file the archive was derived from; in the latter case the archive path
    is resolved with ``get_vec_path``. After the insert, a marker file derived
    from the archive path is written so later calls skip this archive.

    Args:
        vector_file_to_upload: Path to the embeddings archive, or to the
            source file sitting next to it.

    Returns:
        None.

    Raises:
        ValueError: If the last ``_``-separated token of the file name is not
            numeric, so no integer primary id can be built.
    """
    # Resolve the archive path first. Archive paths are used as-is, because
    # running them through get_vec_path would append the suffix a second time.
    if not vector_file_to_upload.endswith('oclip_embeds.npz'):
        vector_file_to_upload = get_vec_path(vector_file_to_upload)

    # Derive the marker once, from the archive path, so the path that is
    # checked here is exactly the path that is written after the insert.
    done_marker = get_db_embed_done_path(vector_file_to_upload)
    if os.path.exists(done_marker):
        print('Already exists in DB, skipping upload')
        return

    # The npz handle keeps the archive open; the context manager closes it
    # once both arrays have been read into memory.
    with np.load(vector_file_to_upload) as vf:
        embeds = vf['embeds']
        fr_nums = vf['frame_numbers']

    # Base name up to its first '.', then its last '_'-separated token; that
    # token prefixes every primary id built below.
    fname_root = vector_file_to_upload.rsplit('/', 1)[-1].split('.')[0]
    fc = fname_root.split('_')[-1]
    # Stored path: the two mount prefixes removed, then cut at the first '.'.
    filepath = (
        vector_file_to_upload
        .replace('/srv/ftp/', '')
        .replace('/mergedfs/ftp', '')
        .split('.')[0]
    )

    data = []
    for embed, frame_num in zip(embeds, fr_nums):
        frame_idx = int(frame_num)
        # Zero-pad with 'd', not 'g': 'g' switches to exponent notation
        # (e.g. '1e+06') for large values, which int() cannot parse.
        id_num = int(f'{fc}{frame_idx:05d}')
        data.append(
            dict(
                primary_id=id_num,
                filepath=filepath,
                frame_number=frame_idx,
                so400m=embed,
            )
        )

    print(f'Inserting into DB, {vector_file_to_upload}')
    client.insert(collection_name='nuggets_so400m', data=data)

    # Record completion only after the insert call has returned.
    with open(done_marker, 'w') as ff:
        ff.write(str(time.time()))
# Walk the tree below root_path, gather every file whose name ends with
# 'oclip_embeds.npz', then upload them one at a time.
root_path = '/srv/ftp/railing/2024'
to_put = []
for dirpath, _subdirs, filenames in os.walk(root_path):
    to_put.extend(
        os.path.join(dirpath, fname)
        for fname in filenames
        if fname.endswith('oclip_embeds.npz')
    )

for vec_file in to_put:
    upload_vector_file(vec_file)