rag/rag_fr_embedding.ipynb

121 lines
6.4 KiB
Plaintext

{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"id": "98de82f6-2dc9-4d27-a5d8-d07ae04b496c",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/Users/peportier/miniforge3/envs/RAG_ENV/lib/python3.9/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
" from .autonotebook import tqdm as notebook_tqdm\n",
"/Users/peportier/miniforge3/envs/RAG_ENV/lib/python3.9/site-packages/transformers/utils/generic.py:441: UserWarning: torch.utils._pytree._register_pytree_node is deprecated. Please use torch.utils._pytree.register_pytree_node instead.\n",
" _torch_pytree._register_pytree_node(\n"
]
}
],
"source": [
"from embedding import EmbeddingModel"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "37408a48-ce90-4176-bc9f-b71ebc22a178",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"2024-01-03 11:13:53,279 - INFO - Load pretrained SentenceTransformer: intfloat/multilingual-e5-large\n",
"/Users/peportier/miniforge3/envs/RAG_ENV/lib/python3.9/site-packages/transformers/utils/generic.py:309: UserWarning: torch.utils._pytree._register_pytree_node is deprecated. Please use torch.utils._pytree.register_pytree_node instead.\n",
" _torch_pytree._register_pytree_node(\n",
"2024-01-03 11:13:56,891 - INFO - Use pytorch device: cpu\n",
"2024-01-03 11:13:56,894 - INFO - Anonymized telemetry enabled. See https://docs.trychroma.com/telemetry for more information.\n",
"2024-01-03 11:13:56,990 - INFO - 4a06529f5f.txt : Start\n",
"Batches: 0%| | 0/1 [00:00<?, ?it/s]huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n",
"To disable this warning, you can either:\n",
"\t- Avoid using `tokenizers` before the fork if possible\n",
"\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n",
"Batches: 100%|████████████████████████████████████| 1/1 [00:00<00:00, 1.64it/s]\n",
"2024-01-03 11:13:57,660 - INFO - 4a06529f5f.txt : Done\n",
"2024-01-03 11:13:57,660 - INFO - 4aac6081e0.txt : Start\n",
"Token indices sequence length is longer than the specified maximum sequence length for this model (595 > 512). Running this sequence through the model will result in indexing errors\n",
"Batches: 100%|████████████████████████████████████| 1/1 [00:00<00:00, 1.93it/s]\n",
"2024-01-03 11:13:58,189 - INFO - 4aac6081e0.txt : Done\n",
"2024-01-03 11:13:58,189 - INFO - 4a5736d002.txt : Start\n",
"Batches: 100%|████████████████████████████████████| 1/1 [00:00<00:00, 5.89it/s]\n",
"2024-01-03 11:13:58,365 - INFO - 4a5736d002.txt : Done\n",
"2024-01-03 11:13:58,366 - INFO - 3d159cbe89.txt : Start\n",
"Batches: 100%|████████████████████████████████████| 1/1 [00:00<00:00, 1.63it/s]\n",
"2024-01-03 11:13:58,988 - INFO - 3d159cbe89.txt : Done\n",
"2024-01-03 11:13:58,989 - INFO - 3f3e46760c.txt : Start\n",
"Batches: 100%|████████████████████████████████████| 1/1 [00:00<00:00, 6.07it/s]\n",
"2024-01-03 11:13:59,159 - INFO - 3f3e46760c.txt : Done\n",
"2024-01-03 11:13:59,160 - INFO - 3ced86d1db.txt : Start\n",
"Batches: 100%|████████████████████████████████████| 1/1 [00:00<00:00, 2.12it/s]\n",
"2024-01-03 11:13:59,640 - INFO - 3ced86d1db.txt : Done\n",
"2024-01-03 11:13:59,641 - INFO - 3bbe30b18a.txt : Start\n",
"Batches: 100%|████████████████████████████████████| 1/1 [00:01<00:00, 1.46s/it]\n",
"2024-01-03 11:14:01,116 - INFO - 3bbe30b18a.txt : Done\n",
"2024-01-03 11:14:01,116 - INFO - 3dbfdeb28e.txt : Start\n",
"Batches: 100%|████████████████████████████████████| 1/1 [00:01<00:00, 1.17s/it]\n",
"2024-01-03 11:14:02,299 - INFO - 3dbfdeb28e.txt : Done\n",
"2024-01-03 11:14:02,299 - INFO - 4adf02d48f.txt : Start\n",
"Batches: 100%|████████████████████████████████████| 1/1 [00:00<00:00, 1.71it/s]\n",
"2024-01-03 11:14:02,895 - INFO - 4adf02d48f.txt : Done\n",
"2024-01-03 11:14:02,896 - INFO - 3c25273538.txt : Start\n",
"Batches: 100%|████████████████████████████████████| 1/1 [00:02<00:00, 2.02s/it]\n",
"2024-01-03 11:14:04,940 - INFO - 3c25273538.txt : Done\n",
"2024-01-03 11:14:04,940 - INFO - 4aeb967bdb.txt : Start\n",
"Batches: 100%|████████████████████████████████████| 1/1 [00:00<00:00, 2.00it/s]\n",
"2024-01-03 11:14:05,449 - INFO - 4aeb967bdb.txt : Done\n"
]
}
],
"source": [
"model_name = 'intfloat/multilingual-e5-large'\n",
"chromadb_path = './chromadbtest'\n",
"folder_path = './docs/test'\n",
"collection_name = 'cera'\n",
"\n",
"embedding_model = EmbeddingModel(model_name, chromadb_path, collection_name)\n",
"embedding_model.embed_folder(folder_path)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "2acd9c49-5676-4e72-9eff-f6fb8ffa94fe",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "RAG_ENV",
"language": "python",
"name": "rag_env"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.18"
}
},
"nbformat": 4,
"nbformat_minor": 5
}