121 lines
6.4 KiB
Plaintext
121 lines
6.4 KiB
Plaintext
{
|
|
"cells": [
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 1,
|
|
"id": "98de82f6-2dc9-4d27-a5d8-d07ae04b496c",
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stderr",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"/Users/peportier/miniforge3/envs/RAG_ENV/lib/python3.9/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
|
|
" from .autonotebook import tqdm as notebook_tqdm\n",
|
|
"/Users/peportier/miniforge3/envs/RAG_ENV/lib/python3.9/site-packages/transformers/utils/generic.py:441: UserWarning: torch.utils._pytree._register_pytree_node is deprecated. Please use torch.utils._pytree.register_pytree_node instead.\n",
|
|
" _torch_pytree._register_pytree_node(\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
"import os\n",
"\n",
"from embedding import EmbeddingModel"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 2,
|
|
"id": "37408a48-ce90-4176-bc9f-b71ebc22a178",
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stderr",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"2024-01-03 11:13:53,279 - INFO - Load pretrained SentenceTransformer: intfloat/multilingual-e5-large\n",
|
|
"/Users/peportier/miniforge3/envs/RAG_ENV/lib/python3.9/site-packages/transformers/utils/generic.py:309: UserWarning: torch.utils._pytree._register_pytree_node is deprecated. Please use torch.utils._pytree.register_pytree_node instead.\n",
|
|
" _torch_pytree._register_pytree_node(\n",
|
|
"2024-01-03 11:13:56,891 - INFO - Use pytorch device: cpu\n",
|
|
"2024-01-03 11:13:56,894 - INFO - Anonymized telemetry enabled. See https://docs.trychroma.com/telemetry for more information.\n",
|
|
"2024-01-03 11:13:56,990 - INFO - 4a06529f5f.txt : Start\n",
|
|
"Batches: 0%| | 0/1 [00:00<?, ?it/s]huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n",
|
|
"To disable this warning, you can either:\n",
|
|
"\t- Avoid using `tokenizers` before the fork if possible\n",
|
|
"\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n",
|
|
"Batches: 100%|████████████████████████████████████| 1/1 [00:00<00:00, 1.64it/s]\n",
|
|
"2024-01-03 11:13:57,660 - INFO - 4a06529f5f.txt : Done\n",
|
|
"2024-01-03 11:13:57,660 - INFO - 4aac6081e0.txt : Start\n",
|
|
"Token indices sequence length is longer than the specified maximum sequence length for this model (595 > 512). Running this sequence through the model will result in indexing errors\n",
|
|
"Batches: 100%|████████████████████████████████████| 1/1 [00:00<00:00, 1.93it/s]\n",
|
|
"2024-01-03 11:13:58,189 - INFO - 4aac6081e0.txt : Done\n",
|
|
"2024-01-03 11:13:58,189 - INFO - 4a5736d002.txt : Start\n",
|
|
"Batches: 100%|████████████████████████████████████| 1/1 [00:00<00:00, 5.89it/s]\n",
|
|
"2024-01-03 11:13:58,365 - INFO - 4a5736d002.txt : Done\n",
|
|
"2024-01-03 11:13:58,366 - INFO - 3d159cbe89.txt : Start\n",
|
|
"Batches: 100%|████████████████████████████████████| 1/1 [00:00<00:00, 1.63it/s]\n",
|
|
"2024-01-03 11:13:58,988 - INFO - 3d159cbe89.txt : Done\n",
|
|
"2024-01-03 11:13:58,989 - INFO - 3f3e46760c.txt : Start\n",
|
|
"Batches: 100%|████████████████████████████████████| 1/1 [00:00<00:00, 6.07it/s]\n",
|
|
"2024-01-03 11:13:59,159 - INFO - 3f3e46760c.txt : Done\n",
|
|
"2024-01-03 11:13:59,160 - INFO - 3ced86d1db.txt : Start\n",
|
|
"Batches: 100%|████████████████████████████████████| 1/1 [00:00<00:00, 2.12it/s]\n",
|
|
"2024-01-03 11:13:59,640 - INFO - 3ced86d1db.txt : Done\n",
|
|
"2024-01-03 11:13:59,641 - INFO - 3bbe30b18a.txt : Start\n",
|
|
"Batches: 100%|████████████████████████████████████| 1/1 [00:01<00:00, 1.46s/it]\n",
|
|
"2024-01-03 11:14:01,116 - INFO - 3bbe30b18a.txt : Done\n",
|
|
"2024-01-03 11:14:01,116 - INFO - 3dbfdeb28e.txt : Start\n",
|
|
"Batches: 100%|████████████████████████████████████| 1/1 [00:01<00:00, 1.17s/it]\n",
|
|
"2024-01-03 11:14:02,299 - INFO - 3dbfdeb28e.txt : Done\n",
|
|
"2024-01-03 11:14:02,299 - INFO - 4adf02d48f.txt : Start\n",
|
|
"Batches: 100%|████████████████████████████████████| 1/1 [00:00<00:00, 1.71it/s]\n",
|
|
"2024-01-03 11:14:02,895 - INFO - 4adf02d48f.txt : Done\n",
|
|
"2024-01-03 11:14:02,896 - INFO - 3c25273538.txt : Start\n",
|
|
"Batches: 100%|████████████████████████████████████| 1/1 [00:02<00:00, 2.02s/it]\n",
|
|
"2024-01-03 11:14:04,940 - INFO - 3c25273538.txt : Done\n",
|
|
"2024-01-03 11:14:04,940 - INFO - 4aeb967bdb.txt : Start\n",
|
|
"Batches: 100%|████████████████████████████████████| 1/1 [00:00<00:00, 2.00it/s]\n",
|
|
"2024-01-03 11:14:05,449 - INFO - 4aeb967bdb.txt : Done\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
"# Run configuration: the values passed to EmbeddingModel and embed_folder below.\n",
"model_name = 'intfloat/multilingual-e5-large'\n",
"chromadb_path = './chromadbtest'\n",
"folder_path = './docs/test'\n",
"collection_name = 'cera'\n",
"\n",
"# A previous run logged the huggingface/tokenizers fork warning, which asks for\n",
"# TOKENIZERS_PARALLELISM to be set explicitly. setdefault keeps any value the\n",
"# environment already provides.\n",
"os.environ.setdefault('TOKENIZERS_PARALLELISM', 'false')\n",
"\n",
"# Build the model wrapper, then embed every document found in folder_path.\n",
"embedding_model = EmbeddingModel(model_name, chromadb_path, collection_name)\n",
"embedding_model.embed_folder(folder_path)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"id": "2acd9c49-5676-4e72-9eff-f6fb8ffa94fe",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": []
|
|
}
|
|
],
|
|
"metadata": {
|
|
"kernelspec": {
|
|
"display_name": "RAG_ENV",
|
|
"language": "python",
|
|
"name": "rag_env"
|
|
},
|
|
"language_info": {
|
|
"codemirror_mode": {
|
|
"name": "ipython",
|
|
"version": 3
|
|
},
|
|
"file_extension": ".py",
|
|
"mimetype": "text/x-python",
|
|
"name": "python",
|
|
"nbconvert_exporter": "python",
|
|
"pygments_lexer": "ipython3",
|
|
"version": "3.9.18"
|
|
}
|
|
},
|
|
"nbformat": 4,
|
|
"nbformat_minor": 5
|
|
}
|