commit 98f6a7b3c0c54ae287c419cb2895a4f14847a5ed Author: Matthieu Date: Mon Nov 3 18:20:12 2025 +0100 Initial commit diff --git a/.coverage b/.coverage new file mode 100644 index 0000000..48c3279 Binary files /dev/null and b/.coverage differ diff --git a/__pycache__/main.cpython-312.pyc b/__pycache__/main.cpython-312.pyc new file mode 100644 index 0000000..86f239b Binary files /dev/null and b/__pycache__/main.cpython-312.pyc differ diff --git a/data/sample-lancedb/rag-table.lance/_latest.manifest b/data/sample-lancedb/rag-table.lance/_latest.manifest new file mode 100644 index 0000000..3fd2b70 Binary files /dev/null and b/data/sample-lancedb/rag-table.lance/_latest.manifest differ diff --git a/data/sample-lancedb/rag-table.lance/_transactions/0-8296f8ca-f94c-4570-a66a-d3dfbdac8cba.txn b/data/sample-lancedb/rag-table.lance/_transactions/0-8296f8ca-f94c-4570-a66a-d3dfbdac8cba.txn new file mode 100644 index 0000000..a7d00ce --- /dev/null +++ b/data/sample-lancedb/rag-table.lance/_transactions/0-8296f8ca-f94c-4570-a66a-d3dfbdac8cba.txn @@ -0,0 +1 @@ +$8296f8ca-f94c-4570-a66a-d3dfbdac8cba˛{2vector ˙˙˙˙˙˙˙˙˙*fixed_size_list:float:38408"content ˙˙˙˙˙˙˙˙˙*string08!source ˙˙˙˙˙˙˙˙˙*string08 \ No newline at end of file diff --git a/data/sample-lancedb/rag-table.lance/_transactions/1-41d784f9-21e6-4b14-8d1f-cafce7c09cb7.txn b/data/sample-lancedb/rag-table.lance/_transactions/1-41d784f9-21e6-4b14-8d1f-cafce7c09cb7.txn new file mode 100644 index 0000000..a4e6267 Binary files /dev/null and b/data/sample-lancedb/rag-table.lance/_transactions/1-41d784f9-21e6-4b14-8d1f-cafce7c09cb7.txn differ diff --git a/data/sample-lancedb/rag-table.lance/_transactions/10-47fb13d9-9bce-4bfc-95ed-17e2fc82e823.txn b/data/sample-lancedb/rag-table.lance/_transactions/10-47fb13d9-9bce-4bfc-95ed-17e2fc82e823.txn new file mode 100644 index 0000000..827bd23 Binary files /dev/null and b/data/sample-lancedb/rag-table.lance/_transactions/10-47fb13d9-9bce-4bfc-95ed-17e2fc82e823.txn differ diff --git a/data/sample-lancedb/rag-table.lance/_transactions/11-b1dbd882-5a16-4809-b588-b066f107cd09.txn b/data/sample-lancedb/rag-table.lance/_transactions/11-b1dbd882-5a16-4809-b588-b066f107cd09.txn new file mode 100644 index 0000000..d26dcb9 Binary files /dev/null and b/data/sample-lancedb/rag-table.lance/_transactions/11-b1dbd882-5a16-4809-b588-b066f107cd09.txn differ diff --git a/data/sample-lancedb/rag-table.lance/_transactions/12-304e8d93-0789-44f2-9fcb-54fad6833db4.txn b/data/sample-lancedb/rag-table.lance/_transactions/12-304e8d93-0789-44f2-9fcb-54fad6833db4.txn new file mode 100644 index 0000000..47e97cc Binary files /dev/null and b/data/sample-lancedb/rag-table.lance/_transactions/12-304e8d93-0789-44f2-9fcb-54fad6833db4.txn differ diff --git a/data/sample-lancedb/rag-table.lance/_transactions/13-f5e3ddc0-37a1-428c-aecb-44b61c4b5019.txn b/data/sample-lancedb/rag-table.lance/_transactions/13-f5e3ddc0-37a1-428c-aecb-44b61c4b5019.txn new file mode 100644 index 0000000..e4424a1 Binary files /dev/null and b/data/sample-lancedb/rag-table.lance/_transactions/13-f5e3ddc0-37a1-428c-aecb-44b61c4b5019.txn differ diff --git a/data/sample-lancedb/rag-table.lance/_transactions/14-681c366d-6882-42c4-a80a-803af07cb669.txn b/data/sample-lancedb/rag-table.lance/_transactions/14-681c366d-6882-42c4-a80a-803af07cb669.txn new file mode 100644 index 0000000..c9936f0 Binary files /dev/null and b/data/sample-lancedb/rag-table.lance/_transactions/14-681c366d-6882-42c4-a80a-803af07cb669.txn differ diff --git a/data/sample-lancedb/rag-table.lance/_transactions/15-e96db896-8792-466a-a433-e3de32d8fc59.txn b/data/sample-lancedb/rag-table.lance/_transactions/15-e96db896-8792-466a-a433-e3de32d8fc59.txn new file mode 100644 index 0000000..88c97bd Binary files /dev/null and b/data/sample-lancedb/rag-table.lance/_transactions/15-e96db896-8792-466a-a433-e3de32d8fc59.txn differ diff --git a/data/sample-lancedb/rag-table.lance/_transactions/16-bcd68c97-939e-46bb-856c-402f90cae659.txn b/data/sample-lancedb/rag-table.lance/_transactions/16-bcd68c97-939e-46bb-856c-402f90cae659.txn new file mode 100644 index 0000000..a094d0c Binary files /dev/null and b/data/sample-lancedb/rag-table.lance/_transactions/16-bcd68c97-939e-46bb-856c-402f90cae659.txn differ diff --git a/data/sample-lancedb/rag-table.lance/_transactions/17-d5db02c4-92d6-4fb2-912a-c7ea8aeeab42.txn b/data/sample-lancedb/rag-table.lance/_transactions/17-d5db02c4-92d6-4fb2-912a-c7ea8aeeab42.txn new file mode 100644 index 0000000..a42f693 Binary files /dev/null and b/data/sample-lancedb/rag-table.lance/_transactions/17-d5db02c4-92d6-4fb2-912a-c7ea8aeeab42.txn differ diff --git a/data/sample-lancedb/rag-table.lance/_transactions/18-06b0341a-bf2a-4e6a-8a87-0d751482a19c.txn b/data/sample-lancedb/rag-table.lance/_transactions/18-06b0341a-bf2a-4e6a-8a87-0d751482a19c.txn new file mode 100644 index 0000000..afdaaad Binary files /dev/null and b/data/sample-lancedb/rag-table.lance/_transactions/18-06b0341a-bf2a-4e6a-8a87-0d751482a19c.txn differ diff --git a/data/sample-lancedb/rag-table.lance/_transactions/19-0c5ebb01-2100-47da-99b0-0e19da41a3ea.txn b/data/sample-lancedb/rag-table.lance/_transactions/19-0c5ebb01-2100-47da-99b0-0e19da41a3ea.txn new file mode 100644 index 0000000..510fcf8 Binary files /dev/null and b/data/sample-lancedb/rag-table.lance/_transactions/19-0c5ebb01-2100-47da-99b0-0e19da41a3ea.txn differ diff --git a/data/sample-lancedb/rag-table.lance/_transactions/2-d6c54a53-8bfc-40cb-a016-2d79d9d0d426.txn b/data/sample-lancedb/rag-table.lance/_transactions/2-d6c54a53-8bfc-40cb-a016-2d79d9d0d426.txn new file mode 100644 index 0000000..8faa6db Binary files /dev/null and b/data/sample-lancedb/rag-table.lance/_transactions/2-d6c54a53-8bfc-40cb-a016-2d79d9d0d426.txn differ diff --git a/data/sample-lancedb/rag-table.lance/_transactions/20-181a3af1-47b4-4df7-b5f1-d322d89e664c.txn b/data/sample-lancedb/rag-table.lance/_transactions/20-181a3af1-47b4-4df7-b5f1-d322d89e664c.txn new file mode 100644 index 0000000..d69e5ba Binary files /dev/null and b/data/sample-lancedb/rag-table.lance/_transactions/20-181a3af1-47b4-4df7-b5f1-d322d89e664c.txn differ diff --git a/data/sample-lancedb/rag-table.lance/_transactions/21-63fe8d6c-c3f5-4951-b1c2-0b9c8d06ce8d.txn b/data/sample-lancedb/rag-table.lance/_transactions/21-63fe8d6c-c3f5-4951-b1c2-0b9c8d06ce8d.txn new file mode 100644 index 0000000..f7a9b1c Binary files /dev/null and b/data/sample-lancedb/rag-table.lance/_transactions/21-63fe8d6c-c3f5-4951-b1c2-0b9c8d06ce8d.txn differ diff --git a/data/sample-lancedb/rag-table.lance/_transactions/22-5f4f05d3-419e-4b8a-b9b5-228a533795f6.txn b/data/sample-lancedb/rag-table.lance/_transactions/22-5f4f05d3-419e-4b8a-b9b5-228a533795f6.txn new file mode 100644 index 0000000..353fb43 Binary files /dev/null and b/data/sample-lancedb/rag-table.lance/_transactions/22-5f4f05d3-419e-4b8a-b9b5-228a533795f6.txn differ diff --git a/data/sample-lancedb/rag-table.lance/_transactions/3-d89728ee-ce26-42df-8e6d-6194b2c4d755.txn b/data/sample-lancedb/rag-table.lance/_transactions/3-d89728ee-ce26-42df-8e6d-6194b2c4d755.txn new file mode 100644 index 0000000..1701ea3 Binary files /dev/null and b/data/sample-lancedb/rag-table.lance/_transactions/3-d89728ee-ce26-42df-8e6d-6194b2c4d755.txn differ diff --git a/data/sample-lancedb/rag-table.lance/_transactions/4-84c16278-326b-4231-8fcb-429a1d29faa4.txn b/data/sample-lancedb/rag-table.lance/_transactions/4-84c16278-326b-4231-8fcb-429a1d29faa4.txn new file mode 100644 index 0000000..8cd4589 Binary files /dev/null and b/data/sample-lancedb/rag-table.lance/_transactions/4-84c16278-326b-4231-8fcb-429a1d29faa4.txn differ diff --git a/data/sample-lancedb/rag-table.lance/_transactions/5-4d8980cb-81ea-439e-a842-083ab3e2097f.txn b/data/sample-lancedb/rag-table.lance/_transactions/5-4d8980cb-81ea-439e-a842-083ab3e2097f.txn new file mode 100644 index 0000000..e6f05e3 Binary files /dev/null and b/data/sample-lancedb/rag-table.lance/_transactions/5-4d8980cb-81ea-439e-a842-083ab3e2097f.txn differ diff --git a/data/sample-lancedb/rag-table.lance/_transactions/6-ed3baef7-951a-44c1-a93b-205fc9c97ea9.txn b/data/sample-lancedb/rag-table.lance/_transactions/6-ed3baef7-951a-44c1-a93b-205fc9c97ea9.txn new file mode 100644 index 0000000..63971ed Binary files /dev/null and b/data/sample-lancedb/rag-table.lance/_transactions/6-ed3baef7-951a-44c1-a93b-205fc9c97ea9.txn differ diff --git a/data/sample-lancedb/rag-table.lance/_transactions/7-bc4c52b5-15aa-422d-a16c-66d1220fbe26.txn b/data/sample-lancedb/rag-table.lance/_transactions/7-bc4c52b5-15aa-422d-a16c-66d1220fbe26.txn new file mode 100644 index 0000000..1a1aa75 Binary files /dev/null and b/data/sample-lancedb/rag-table.lance/_transactions/7-bc4c52b5-15aa-422d-a16c-66d1220fbe26.txn differ diff --git a/data/sample-lancedb/rag-table.lance/_transactions/8-2b6021bf-c449-4dce-afde-74b83013f0ca.txn b/data/sample-lancedb/rag-table.lance/_transactions/8-2b6021bf-c449-4dce-afde-74b83013f0ca.txn new file mode 100644 index 0000000..c38de5e Binary files /dev/null and b/data/sample-lancedb/rag-table.lance/_transactions/8-2b6021bf-c449-4dce-afde-74b83013f0ca.txn differ diff --git a/data/sample-lancedb/rag-table.lance/_transactions/9-a3692c6d-8d44-443b-aba5-4c6704b101a2.txn b/data/sample-lancedb/rag-table.lance/_transactions/9-a3692c6d-8d44-443b-aba5-4c6704b101a2.txn new file mode 100644 index 0000000..5ba5429 Binary files /dev/null and b/data/sample-lancedb/rag-table.lance/_transactions/9-a3692c6d-8d44-443b-aba5-4c6704b101a2.txn differ diff --git a/data/sample-lancedb/rag-table.lance/_versions/1.manifest b/data/sample-lancedb/rag-table.lance/_versions/1.manifest new file mode 100644 index 0000000..ae0edb6 Binary files /dev/null and b/data/sample-lancedb/rag-table.lance/_versions/1.manifest differ diff --git a/data/sample-lancedb/rag-table.lance/_versions/10.manifest b/data/sample-lancedb/rag-table.lance/_versions/10.manifest new file mode 100644 index 0000000..dd35d55 Binary files /dev/null and b/data/sample-lancedb/rag-table.lance/_versions/10.manifest differ diff --git a/data/sample-lancedb/rag-table.lance/_versions/11.manifest b/data/sample-lancedb/rag-table.lance/_versions/11.manifest new file mode 100644 index 0000000..9ba982c Binary files /dev/null and b/data/sample-lancedb/rag-table.lance/_versions/11.manifest differ diff --git a/data/sample-lancedb/rag-table.lance/_versions/12.manifest b/data/sample-lancedb/rag-table.lance/_versions/12.manifest new file mode 100644 index 0000000..2dea9f6 Binary files /dev/null and b/data/sample-lancedb/rag-table.lance/_versions/12.manifest differ diff --git a/data/sample-lancedb/rag-table.lance/_versions/13.manifest b/data/sample-lancedb/rag-table.lance/_versions/13.manifest new file mode 100644 index 0000000..3c5bbcb Binary files /dev/null and b/data/sample-lancedb/rag-table.lance/_versions/13.manifest differ diff --git a/data/sample-lancedb/rag-table.lance/_versions/14.manifest b/data/sample-lancedb/rag-table.lance/_versions/14.manifest new file mode 100644 index 0000000..76a90b8 Binary files /dev/null and b/data/sample-lancedb/rag-table.lance/_versions/14.manifest differ diff --git a/data/sample-lancedb/rag-table.lance/_versions/15.manifest b/data/sample-lancedb/rag-table.lance/_versions/15.manifest new file mode 100644 index 0000000..4f02a1e Binary files /dev/null and b/data/sample-lancedb/rag-table.lance/_versions/15.manifest differ diff --git a/data/sample-lancedb/rag-table.lance/_versions/16.manifest b/data/sample-lancedb/rag-table.lance/_versions/16.manifest new file mode 100644 index 0000000..2f89eed Binary files /dev/null and b/data/sample-lancedb/rag-table.lance/_versions/16.manifest differ diff --git a/data/sample-lancedb/rag-table.lance/_versions/17.manifest b/data/sample-lancedb/rag-table.lance/_versions/17.manifest new file mode 100644 index 0000000..709d442 Binary files /dev/null and b/data/sample-lancedb/rag-table.lance/_versions/17.manifest differ diff --git a/data/sample-lancedb/rag-table.lance/_versions/18.manifest b/data/sample-lancedb/rag-table.lance/_versions/18.manifest new file mode 100644 index 0000000..8dc5cde Binary files /dev/null and b/data/sample-lancedb/rag-table.lance/_versions/18.manifest differ diff --git a/data/sample-lancedb/rag-table.lance/_versions/19.manifest b/data/sample-lancedb/rag-table.lance/_versions/19.manifest new file mode 100644 index 0000000..2be701d Binary files /dev/null and b/data/sample-lancedb/rag-table.lance/_versions/19.manifest differ diff --git a/data/sample-lancedb/rag-table.lance/_versions/2.manifest b/data/sample-lancedb/rag-table.lance/_versions/2.manifest new file mode 100644 index 0000000..3f1a206 Binary files /dev/null and b/data/sample-lancedb/rag-table.lance/_versions/2.manifest differ diff --git a/data/sample-lancedb/rag-table.lance/_versions/20.manifest b/data/sample-lancedb/rag-table.lance/_versions/20.manifest new file mode 100644 index 0000000..4909763 Binary files /dev/null and b/data/sample-lancedb/rag-table.lance/_versions/20.manifest differ diff --git a/data/sample-lancedb/rag-table.lance/_versions/21.manifest b/data/sample-lancedb/rag-table.lance/_versions/21.manifest new file mode 100644 index 0000000..4caf021 Binary files /dev/null and b/data/sample-lancedb/rag-table.lance/_versions/21.manifest differ diff --git a/data/sample-lancedb/rag-table.lance/_versions/22.manifest b/data/sample-lancedb/rag-table.lance/_versions/22.manifest new file mode 100644 index 0000000..a723d49 Binary files /dev/null and b/data/sample-lancedb/rag-table.lance/_versions/22.manifest differ diff --git a/data/sample-lancedb/rag-table.lance/_versions/23.manifest b/data/sample-lancedb/rag-table.lance/_versions/23.manifest new file mode 100644 index 0000000..3fd2b70 Binary files /dev/null and b/data/sample-lancedb/rag-table.lance/_versions/23.manifest differ diff --git a/data/sample-lancedb/rag-table.lance/_versions/3.manifest b/data/sample-lancedb/rag-table.lance/_versions/3.manifest new file mode 100644 index 0000000..2f90443 Binary files /dev/null and b/data/sample-lancedb/rag-table.lance/_versions/3.manifest differ diff --git a/data/sample-lancedb/rag-table.lance/_versions/4.manifest b/data/sample-lancedb/rag-table.lance/_versions/4.manifest new file mode 100644 index 0000000..50196a7 Binary files /dev/null and b/data/sample-lancedb/rag-table.lance/_versions/4.manifest differ diff --git a/data/sample-lancedb/rag-table.lance/_versions/5.manifest b/data/sample-lancedb/rag-table.lance/_versions/5.manifest new file mode 100644 index 0000000..ce9dd1b Binary files /dev/null and b/data/sample-lancedb/rag-table.lance/_versions/5.manifest differ diff --git a/data/sample-lancedb/rag-table.lance/_versions/6.manifest b/data/sample-lancedb/rag-table.lance/_versions/6.manifest new file mode 100644 index 0000000..fb1ee20 Binary files /dev/null and b/data/sample-lancedb/rag-table.lance/_versions/6.manifest differ diff --git a/data/sample-lancedb/rag-table.lance/_versions/7.manifest b/data/sample-lancedb/rag-table.lance/_versions/7.manifest new file mode 100644 index 0000000..b018d70 Binary files /dev/null and b/data/sample-lancedb/rag-table.lance/_versions/7.manifest differ diff --git a/data/sample-lancedb/rag-table.lance/_versions/8.manifest b/data/sample-lancedb/rag-table.lance/_versions/8.manifest new file mode 100644 index 0000000..8764b36 Binary files /dev/null and b/data/sample-lancedb/rag-table.lance/_versions/8.manifest differ diff --git a/data/sample-lancedb/rag-table.lance/_versions/9.manifest b/data/sample-lancedb/rag-table.lance/_versions/9.manifest new file mode 100644 index 0000000..118f007 Binary files /dev/null and b/data/sample-lancedb/rag-table.lance/_versions/9.manifest differ diff --git a/data/sample-lancedb/rag-table.lance/data/00b0b1bb-3e49-4d56-b7c3-83f2ba18e185.lance b/data/sample-lancedb/rag-table.lance/data/00b0b1bb-3e49-4d56-b7c3-83f2ba18e185.lance new file mode 100644 index 0000000..02d4318 Binary files /dev/null and b/data/sample-lancedb/rag-table.lance/data/00b0b1bb-3e49-4d56-b7c3-83f2ba18e185.lance differ diff --git a/data/sample-lancedb/rag-table.lance/data/0542ada5-4974-4fe7-927b-722e39c216d2.lance b/data/sample-lancedb/rag-table.lance/data/0542ada5-4974-4fe7-927b-722e39c216d2.lance new file mode 100644 index 0000000..2fcd0f2 Binary files /dev/null and b/data/sample-lancedb/rag-table.lance/data/0542ada5-4974-4fe7-927b-722e39c216d2.lance differ diff --git a/data/sample-lancedb/rag-table.lance/data/109f608a-8c6c-4778-9bea-016d0694e049.lance b/data/sample-lancedb/rag-table.lance/data/109f608a-8c6c-4778-9bea-016d0694e049.lance new file mode 100644 index 0000000..2fcd0f2 Binary files /dev/null and b/data/sample-lancedb/rag-table.lance/data/109f608a-8c6c-4778-9bea-016d0694e049.lance differ diff --git a/data/sample-lancedb/rag-table.lance/data/12a36624-5e9b-49e5-be55-da7f8dcf6b4e.lance b/data/sample-lancedb/rag-table.lance/data/12a36624-5e9b-49e5-be55-da7f8dcf6b4e.lance new file mode 100644 index 0000000..2fcd0f2 Binary files /dev/null and b/data/sample-lancedb/rag-table.lance/data/12a36624-5e9b-49e5-be55-da7f8dcf6b4e.lance differ diff --git a/data/sample-lancedb/rag-table.lance/data/42bdd03f-992b-4adc-9268-cd30964c0527.lance b/data/sample-lancedb/rag-table.lance/data/42bdd03f-992b-4adc-9268-cd30964c0527.lance new file mode 100644 index 0000000..22959a0 Binary files /dev/null and b/data/sample-lancedb/rag-table.lance/data/42bdd03f-992b-4adc-9268-cd30964c0527.lance differ diff --git a/data/sample-lancedb/rag-table.lance/data/59964470-be63-4b12-b667-111f8dd00e4b.lance b/data/sample-lancedb/rag-table.lance/data/59964470-be63-4b12-b667-111f8dd00e4b.lance new file mode 100644 index 0000000..1c0847f Binary files /dev/null and b/data/sample-lancedb/rag-table.lance/data/59964470-be63-4b12-b667-111f8dd00e4b.lance differ diff --git a/data/sample-lancedb/rag-table.lance/data/6114ee58-342a-4c2a-abf3-b760e38a1f12.lance b/data/sample-lancedb/rag-table.lance/data/6114ee58-342a-4c2a-abf3-b760e38a1f12.lance new file mode 100644 index 0000000..22959a0 Binary files /dev/null and b/data/sample-lancedb/rag-table.lance/data/6114ee58-342a-4c2a-abf3-b760e38a1f12.lance differ diff --git a/data/sample-lancedb/rag-table.lance/data/764b1fc9-f400-4806-9105-d18d496fee4e.lance b/data/sample-lancedb/rag-table.lance/data/764b1fc9-f400-4806-9105-d18d496fee4e.lance new file mode 100644 index 0000000..02d4318 Binary files /dev/null and b/data/sample-lancedb/rag-table.lance/data/764b1fc9-f400-4806-9105-d18d496fee4e.lance differ diff --git a/data/sample-lancedb/rag-table.lance/data/793d0097-eaac-402a-bbbb-481f5d62bd76.lance b/data/sample-lancedb/rag-table.lance/data/793d0097-eaac-402a-bbbb-481f5d62bd76.lance new file mode 100644 index 0000000..02d4318 Binary files /dev/null and b/data/sample-lancedb/rag-table.lance/data/793d0097-eaac-402a-bbbb-481f5d62bd76.lance differ diff --git a/data/sample-lancedb/rag-table.lance/data/890b660c-853c-4eb7-a2a1-d88014384082.lance b/data/sample-lancedb/rag-table.lance/data/890b660c-853c-4eb7-a2a1-d88014384082.lance new file mode 100644 index 0000000..2fcd0f2 Binary files /dev/null and b/data/sample-lancedb/rag-table.lance/data/890b660c-853c-4eb7-a2a1-d88014384082.lance differ diff --git a/data/sample-lancedb/rag-table.lance/data/b75b11c0-4de0-4230-9216-6ff2edf2391f.lance b/data/sample-lancedb/rag-table.lance/data/b75b11c0-4de0-4230-9216-6ff2edf2391f.lance new file mode 100644 index 0000000..2fcd0f2 Binary files /dev/null and b/data/sample-lancedb/rag-table.lance/data/b75b11c0-4de0-4230-9216-6ff2edf2391f.lance differ diff --git a/data/sample-lancedb/rag-table.lance/data/c02db583-5102-480d-82f6-fc95226bfbfc.lance b/data/sample-lancedb/rag-table.lance/data/c02db583-5102-480d-82f6-fc95226bfbfc.lance new file mode 100644 index 0000000..02d4318 Binary files /dev/null and b/data/sample-lancedb/rag-table.lance/data/c02db583-5102-480d-82f6-fc95226bfbfc.lance differ diff --git a/data/sample-lancedb/rag-table.lance/data/d840eb58-d3e4-4ad0-9337-2f80918368cb.lance b/data/sample-lancedb/rag-table.lance/data/d840eb58-d3e4-4ad0-9337-2f80918368cb.lance new file mode 100644 index 0000000..8d2fa05 Binary files /dev/null and b/data/sample-lancedb/rag-table.lance/data/d840eb58-d3e4-4ad0-9337-2f80918368cb.lance differ diff --git a/data/sample-lancedb/rag-table.lance/data/d84fbed5-0d87-4e34-bd78-9ee4440e07fa.lance b/data/sample-lancedb/rag-table.lance/data/d84fbed5-0d87-4e34-bd78-9ee4440e07fa.lance new file mode 100644 index 0000000..2fcd0f2 Binary files /dev/null and b/data/sample-lancedb/rag-table.lance/data/d84fbed5-0d87-4e34-bd78-9ee4440e07fa.lance differ diff --git a/data/sample-lancedb/rag-table.lance/data/d93c721f-27b1-420c-bb12-4647e0bce00f.lance b/data/sample-lancedb/rag-table.lance/data/d93c721f-27b1-420c-bb12-4647e0bce00f.lance new file mode 100644 index 0000000..1c0847f Binary files /dev/null and b/data/sample-lancedb/rag-table.lance/data/d93c721f-27b1-420c-bb12-4647e0bce00f.lance differ diff --git a/data/sample-lancedb/rag-table.lance/data/dd80b510-1145-4bb2-99f7-436c679cf7f9.lance b/data/sample-lancedb/rag-table.lance/data/dd80b510-1145-4bb2-99f7-436c679cf7f9.lance new file mode 100644 index 0000000..8d2fa05 Binary files /dev/null and b/data/sample-lancedb/rag-table.lance/data/dd80b510-1145-4bb2-99f7-436c679cf7f9.lance differ diff --git a/data/sample-lancedb/rag-table.lance/data/e80d6b03-f7f5-4e2d-b837-62cb2a674e2b.lance b/data/sample-lancedb/rag-table.lance/data/e80d6b03-f7f5-4e2d-b837-62cb2a674e2b.lance new file mode 100644 index 0000000..8d2fa05 Binary files /dev/null and b/data/sample-lancedb/rag-table.lance/data/e80d6b03-f7f5-4e2d-b837-62cb2a674e2b.lance differ diff --git a/data/sample-lancedb/rag-table.lance/data/e9eeedb1-073a-45a0-a0db-228ab9a0a28f.lance b/data/sample-lancedb/rag-table.lance/data/e9eeedb1-073a-45a0-a0db-228ab9a0a28f.lance new file mode 100644 index 0000000..2fcd0f2 Binary files /dev/null and b/data/sample-lancedb/rag-table.lance/data/e9eeedb1-073a-45a0-a0db-228ab9a0a28f.lance differ diff --git a/data/sample-lancedb/rag-table.lance/data/ec88cc97-a52a-4cf0-8e5b-279586dfe444.lance b/data/sample-lancedb/rag-table.lance/data/ec88cc97-a52a-4cf0-8e5b-279586dfe444.lance new file mode 100644 index 0000000..2fcd0f2 Binary files /dev/null and b/data/sample-lancedb/rag-table.lance/data/ec88cc97-a52a-4cf0-8e5b-279586dfe444.lance differ diff --git a/data/sample-lancedb/rag-table.lance/data/f0c04908-1a3a-43de-911a-46fcd21eb685.lance b/data/sample-lancedb/rag-table.lance/data/f0c04908-1a3a-43de-911a-46fcd21eb685.lance new file mode 100644 index 0000000..2fcd0f2 Binary files /dev/null and b/data/sample-lancedb/rag-table.lance/data/f0c04908-1a3a-43de-911a-46fcd21eb685.lance differ diff --git a/data/sample-lancedb/rag-table.lance/data/fb307ae5-f5b3-425b-952c-f8543139ab3d.lance b/data/sample-lancedb/rag-table.lance/data/fb307ae5-f5b3-425b-952c-f8543139ab3d.lance new file mode 100644 index 0000000..2fcd0f2 Binary files /dev/null and b/data/sample-lancedb/rag-table.lance/data/fb307ae5-f5b3-425b-952c-f8543139ab3d.lance differ diff --git a/data/sample-lancedb/rag-table.lance/data/fca257de-8ed3-49a0-b437-80e7d4655965.lance b/data/sample-lancedb/rag-table.lance/data/fca257de-8ed3-49a0-b437-80e7d4655965.lance new file mode 100644 index 0000000..2fcd0f2 Binary files /dev/null and b/data/sample-lancedb/rag-table.lance/data/fca257de-8ed3-49a0-b437-80e7d4655965.lance differ diff --git a/data/source/bilan_comptable_2024.pdf b/data/source/bilan_comptable_2024.pdf new file mode 100644 index 0000000..9ae35ae Binary files /dev/null and b/data/source/bilan_comptable_2024.pdf differ diff --git a/data/source/database.pdf b/data/source/database.pdf new file mode 100644 index 0000000..141948d Binary files /dev/null and b/data/source/database.pdf differ diff --git a/data/source/employes.pdf b/data/source/employes.pdf new file mode 100644 index 0000000..fa0f565 Binary files /dev/null and b/data/source/employes.pdf differ diff --git a/data/source/facture_14_03_2025.pdf b/data/source/facture_14_03_2025.pdf new file mode 100644 index 0000000..1973e8a Binary files /dev/null and b/data/source/facture_14_03_2025.pdf differ diff --git a/data/source/fournisseurs.pdf b/data/source/fournisseurs.pdf new file mode 100644 index 0000000..3ae5f1a Binary files /dev/null and b/data/source/fournisseurs.pdf differ diff --git a/data/source/historique_commandes.pdf b/data/source/historique_commandes.pdf new file mode 100644 index 0000000..6544f7e Binary files /dev/null and b/data/source/historique_commandes.pdf differ diff --git a/data/source/planning_production_mars_2025.pdf b/data/source/planning_production_mars_2025.pdf new file mode 100644 index 0000000..18ca74c Binary files /dev/null and b/data/source/planning_production_mars_2025.pdf differ diff --git a/main.py b/main.py new file mode 100644 index 0000000..6737fc4 --- /dev/null +++ b/main.py @@ -0,0 +1,47 @@ +from src.impl.datastore import Datastore, DataItem +from src.impl.indexer import Indexer +from src.impl.retriever import Retriever +from src.impl.response_generator import ResponseGenerator +TEST_PATH = "data/source" +def main(): + query_graphiste = "Quel est le salaire brut mensuel du graphiste ?" + query_graphiste_en = "What is the monthly gross salary of the graphist designer ?" + print("Testing indexer") + indexer = Indexer() + items_from_indexer = indexer.index(["data/source/database.pdf", + "data/source/bilan_comptable_2024.pdf", + "data/source/employes.pdf", + "data/source/facture_14_03_2025.pdf", + "data/source/fournisseurs.pdf", + "data/source/historique_commandes.pdf", + "data/source/planning_production_mars_2025.pdf", + ]) + + + print("Testing datastore") + datastore = Datastore() + print(f"Model's maximum sequence length:{datastore.model.max_seq_length}") + test_vector = datastore.create_vector("test") + + data_item_to_test = DataItem( + content = "Data item being tested", + source = "from a test" + ) + + datastore.add_items([data_item_to_test]) + + datastore.add_items(items_from_indexer) + print(datastore.search_datastore(("Data item being tested"))) + print(datastore.search_datastore("Red t-shirt")) + + print("Testing retriever") + retriever = Retriever(datastore= datastore) + print(retriever.search_retriever(query_graphiste)) + + print("Testing Response generator") + #response_generator = ResponseGenerator() + #print(response_generator.generate_response(query_graphiste, retriever.search_retriever(query_graphiste))) + print("fin") + exit + +main() \ No newline at end of file diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..9b4727b --- /dev/null +++ b/requirements.txt @@ -0,0 +1,16 @@ +#executer pip install -r requirements.txt +numpy<2 +pandas>=2.1.4,<3.0 +scikit-learn +torch --index-url https://download.pytorch.org/whl/cpu +transformers +accelerate>=1.2.1,<2.0.0 +sentence-transformers +pyarrow==14.0.1 + +# Default Dependencies +pydantic>=2.0.0 # For data validation +lancedb==0.6.13 +docling==2.31.0 +cohere==5.15.0 + diff --git a/src/__init__.py b/src/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/__pycache__/__init__.cpython-312.pyc b/src/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000..19c1e70 Binary files /dev/null and b/src/__pycache__/__init__.cpython-312.pyc differ diff --git a/src/impl/__init__.py b/src/impl/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/impl/__pycache__/__init__.cpython-312.pyc b/src/impl/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000..3b07316 Binary files /dev/null and b/src/impl/__pycache__/__init__.cpython-312.pyc differ diff --git a/src/impl/__pycache__/datastore.cpython-312.pyc b/src/impl/__pycache__/datastore.cpython-312.pyc new file mode 100644 index 0000000..91904ef Binary files /dev/null and b/src/impl/__pycache__/datastore.cpython-312.pyc differ diff --git a/src/impl/__pycache__/indexer.cpython-312.pyc b/src/impl/__pycache__/indexer.cpython-312.pyc new file mode 100644 index 0000000..054c60b Binary files /dev/null and b/src/impl/__pycache__/indexer.cpython-312.pyc differ diff --git a/src/impl/__pycache__/response_generator.cpython-312.pyc b/src/impl/__pycache__/response_generator.cpython-312.pyc new file mode 100644 index 0000000..c41d04a Binary files /dev/null and b/src/impl/__pycache__/response_generator.cpython-312.pyc differ diff --git a/src/impl/__pycache__/retriever.cpython-312.pyc b/src/impl/__pycache__/retriever.cpython-312.pyc new file mode 100644 index 0000000..a7a9373 Binary files /dev/null and b/src/impl/__pycache__/retriever.cpython-312.pyc differ diff --git a/src/impl/datastore.py b/src/impl/datastore.py new file mode 100644 index 0000000..3055b01 --- /dev/null +++ b/src/impl/datastore.py @@ -0,0 +1,163 @@ +from typing import List +from ..interface.base_datastore import BaseDatastore, DataItem +import lancedb +from lancedb.table import Table +from typing import List +import pyarrow as pa +from sentence_transformers import SentenceTransformer +from sklearn.metrics.pairwise import cosine_similarity +import numpy as np +# from concurrent.futures import ThreadPoolExecutor + + +class Datastore(BaseDatastore): + + DB_PATH = "data/sample-lancedb" + DB_TABLE_NAME = "rag-table" + + def __init__(self): + """Constructeur par dĂ©faut, initialise les dimensions des vecteurs pour l'embedding + (actuellement 384 par dĂ©faut pour le modèle all-MiniLm-L6-v2), charge le modèle SentenceTransformer, + connecte la base de donnĂ©es et rĂ©cupère la table. + #Model's maximum sequence length = 256 + """ + self.vector_dimensions = 384 # all-MiniLm-L6-v2 a une dimension fixe de 384 + self.model = SentenceTransformer("all-MiniLM-L6-v2") + self.vector_db = lancedb.connect(self.DB_PATH) + self.table: Table = self._get_table() + + + + def reset_table(self) -> Table: + """Drop la table si elle existe puis crĂ©e une table selon le schĂ©ma + vector (liste de float32 de dimension dĂ©finie dans la classe), content et source + et l'ouvre. + + Returns: + Table: La table crĂ©e + """ + try: + self.vector_db.drop_table(self.DB_TABLE_NAME) + except Exception as e: + print("Unable to drop the table, assuming it does not exist.") + + schema = pa.schema( + [ + pa.field("vector", pa.list_(pa.float32(),self.vector_dimensions)), + pa.field("content", pa.utf8()), + pa.field("source", pa.utf8()), + ] + ) + + self.table = self.vector_db.create_table(self.DB_TABLE_NAME, schema = schema) + #self.table = self.vector_db.open_table(self.DB_TABLE_NAME) + print(f"Table was reset/created: {self.DB_TABLE_NAME} in {self.DB_PATH}") + return self.table + + + def _get_table(self) -> Table: + """Ouvre la table ou la reset en cas d'Ă©chec + + Returns: + Table: la table ouverte + """ + try: + return self.vector_db.open_table(self.DB_TABLE_NAME) + except Exception as e: + print(f"Error opening the table {e}. Trying to reset it.") + return self.reset_table() + + def add_items(self, items: List[DataItem]) -> None: + """Ajoute les items en entrĂ©es dans le dataset (nĂ©cessite un embedding) + opĂ©ration network bound donc Ă  parallĂ©liser + Args: + items (List[DataItem]): Liste de DataItems Ă  ajouter + + Returns: + _type_: _description_ + """ + if not items: + return + + contents = [item.content for item in items] + sources = [item.source for item in items] + + "embedding du contenu de chaque entrĂ©e par batch de 32" + print(f"GĂ©nĂ©ration des embeddings pour {len(items)} items...") + vectors = self.model.encode( + contents, + batch_size= 32, + ) + + "conversion en dictionnaires pour stocker les documents dans la BDD" + entries = [ + { + "vector": vector, + "content": content, + "source": source + } + for vector, content, source in zip(vectors, contents, sources) + ] + + #self.table.merge_insert("source").when_matched_update_all().when_not_matched_insert_all().execute(entries) + self.table.add(entries) + print(f"{len(entries)} items ajoutĂ©s") + + #deprecated + def _convert_items_to_entry(self, item: DataItem) -> dict: + """Convertir un DataItem en dictionnaire correspondant au schĂ©ma du Datastore + + Args: + item (DataItem): item Ă  convertir + + Returns: + dict: Dictionnaire contenant le vecteur, le contenu et la source + """ + + vector = self.create_vector(item.content) + + return{ + "vector": vector, + "content": item.content, + "source": item.source, + } + + def create_vector(self, content: str) -> List[float]: + """Utilise le modèle d'embedding pour convertir le str en vecteur (list[float]) + + Args: + content (str): contenu de l'entrĂ©e Ă  vectoriser + + Returns: + List[float]: vecteur renvoyĂ© + """ + response = self.model.encode(content) + return response.tolist() + + def search_datastore(self, query: str, top_k: int = 5) -> List[str]: + """Embedde la query et lance une recherche + + Args: + query (str): requĂŞte + top_k (int, optional): Nombre de documents Ă  retourner au maximum. Defaults to 5. + + Returns: + List[str]: Tableau contenant le contenu des documents retrouvĂ©s + """ + + vector = self.model.encode(query) + results = ( + self.table + .search(vector, vector_column_name="vector") + .select(["content", "source"]) + .limit(top_k) + .to_list() + ) + + result_content = [ + result["content"] + for result in results + if "content" in result and result["content"] is not None] + return result_content + + diff --git a/src/impl/evaluator.py b/src/impl/evaluator.py new file mode 100644 index 0000000..f17bea4 --- /dev/null +++ b/src/impl/evaluator.py @@ -0,0 +1,45 @@ +from ..interface.base_evaluator import EvaluationResult, BaseEvaluator +import requests + +class Evaluator(BaseEvaluator): + + def __init__(self, model_name: str = "llama3.2:3b", base_url: str = "http://localhost:11434"): + self.base_url = base_url + self.model_name = model_name + + + SYSTEM_PROMPT = """ +You are a system that evaluates the correctness of a response to a question. +The question will be provided in ... tags. +The response will be provided in ... tags. +The expected answer will be provided in ... tags. + +The response doesn't have to exactly match all the words/context the expected answer. It just needs to be right about +the answer to the actual question itself. + +Evaluate whether the response is correct or not, and return your reasoning in ... tags. +Then return the result in ... tags — either as 'true' or 'false'. +""" + + def evaluate(self, query: str, response: str, expected_answer: str) -> EvaluationResult: + user_prompt = f""" + \n{query} + \n{response} + \n{expected_answer} + """ + + response_content = requests.post( + f"{self.base_url}/api/generate", + json={ + "model": self.model_name, + "prompt": user_prompt, + "stream": False, + "options": { + "temperature": 0.7, + "top_p": 0.9, + } + } + + return super().evaluate(query, response, expected_answer) + + \ No newline at end of file diff --git a/src/impl/indexer.py b/src/impl/indexer.py new file mode 100644 index 0000000..dc1c33a --- /dev/null +++ b/src/impl/indexer.py @@ -0,0 +1,101 @@ +from typing import List +from src.interface.base_datastore import DataItem +from src.interface.base_indexer import BaseIndexer +from docling.document_converter import DocumentConverter +from docling_core.transforms.chunker.hybrid_chunker import HybridChunker +import os +from docling_core.transforms.chunker.tokenizer.huggingface import HuggingFaceTokenizer +from transformers import AutoTokenizer +from langchain_text_splitters import RecursiveCharacterTextSplitter +# pip install langchain langchain-text-splitters + + +EMBED_MODEL_ID = "sentence-transformers/all-MiniLM-L6-v2" +MAX_TOKENS = 256 # set to a small number for illustrative purposes +#prĂ©cedemment 512 + +class Indexer(BaseIndexer): + + def __init__(self): + + self.converter = DocumentConverter() + self.tokenizer = HuggingFaceTokenizer( + tokenizer = AutoTokenizer.from_pretrained(EMBED_MODEL_ID), + max_tokens = MAX_TOKENS, # optional, by default derived from `tokenizer` for HF case + ) + self.chunker = HybridChunker( + tokenizer = self.tokenizer, + max_tokens = MAX_TOKENS, + #merge_peers = True, + #handle_tables = "separate", + #handle_pictures = "separate" + ) + """self.text_splitter = RecursiveCharacterTextSplitter.from_huggingface_tokenizer( + tokenizer = self.tokenizer, + chunk_size = MAX_TOKENS, + chunk_overlap = MAX_TOKENS // 10, + separators=["\n\n", "\n", ". ", " ", ""], + )""" + + def index(self, document_paths: List[str]) -> List[DataItem]: + """Convertit les documents en format docling puis les dĂ©coupe en morceaux. + Les morceaux sont ensuite convertis sous forme de DataItem en y ajoutant des mĂ©tadonnĂ©es + + Args: + document_paths (List[str]): Liste des documents Ă  indexer + + Returns: + List[DataItem]: Liste des DataItems indexĂ©s + """ + items = [] + for document_path in document_paths: + try: + document = self.converter.convert(document_path).document + chunks = list(self.chunker.chunk(document)) + #chunks = self.text_splitter.split_text(document) + + item = self._convert_to_DataItem(chunks, document_path) + + items.extend(item) + + except Exception as e: + print(f"Erreur lors du traitement de {document_path}: {e}") + continue + + return items + + def _convert_to_DataItem(self, chunks, document_path: str) -> List[DataItem]: + """CrĂ©e une liste de DataItems en ajoutant les Ă©ventuels headers au dĂ©but du contenu + et en rĂ©cupĂ©rant la source Ă  l'aide du module os + + Args: + chunks : liste de chunks de documents Ă  traiter + document_path: chemin vers le document, permet d'utiliser os pour rĂ©cupĂ©rer directement le nom + + Returns: + Lits[DataItem]: liste de DataItem contenant les mĂ©tadonnĂ©es et sĂ©parant le contenu et la source + """ + items = [] + for i, chunk in enumerate(chunks): + + try: + headings = "" + + if(hasattr(chunk, 'meta') and hasattr(chunk.meta, 'headings') and chunk.meta.headings): + headings = "## " + ", ".join(chunk.meta.headings) + "\n" + + text = chunk.text if(hasattr(chunk, 'text')) else str(chunk) + + content = f"{headings}{text}" + + filename = os.path.basename(document_path) + source = f"{filename}:chunk {i}" + + item = DataItem(content = content, source = source) + items.append(item) + + except Exception as e: + print(f" Erreur sur le chunk {i}: {e}") + continue + + return items \ No newline at end of file diff --git a/src/impl/response_generator.py b/src/impl/response_generator.py new file mode 100644 index 0000000..204d06f --- /dev/null +++ b/src/impl/response_generator.py @@ -0,0 +1,83 @@ +from typing import List +from ..interface.base_response_generator import BaseResponseGenerator +import requests +import json + +SYSTEM_PROMPT = """Tu es un assistant intelligent qui rĂ©pond aux questions en te basant sur le contexte fourni. + +Règles importantes: +- RĂ©ponds UNIQUEMENT en te basant sur les informations du contexte +- Si l'information n'est pas dans le contexte, dis clairement "Je ne trouve pas cette information dans les documents fournis" +- Cite les sources quand c'est pertinent +- RĂ©ponds en français de manière claire et concise +- Ne rĂ©ponds pas avec "Selon le document" mais donne directement l'information""" + + +class ResponseGenerator(BaseResponseGenerator): + + def __init__(self, model_name: str = "llama3.2:3b", base_url: str = "http://localhost:11434"): + self.model_name = model_name + self.base_url = base_url + + def generate_response(self, query: str, context: List[str]) -> str: + """GĂ©nère une rĂ©ponse basĂ©e sur la requĂŞte et le contexte.""" + + # Formater le contexte + formatted_context = "\n\n".join([f"Document {i+1}:\n{doc}" for i, doc in enumerate(context)]) + + # CrĂ©er le prompt + prompt =f"""Instructions: {SYSTEM_PROMPT} + + Contexte: {formatted_context} + + Question: {query} + + RĂ©ponse:""" + + # Appeler Ollama via l'API + try: + response = requests.post( + f"{self.base_url}/api/generate", + json={ + "model": self.model_name, + "prompt": prompt, + "stream": False, + "options": { + "temperature": 0.7, + "top_p": 0.9, + } + } + ) + + # VĂ©rifier le statut de la rĂ©ponse + response.raise_for_status() + + # Parser le JSON + result = response.json() + + # DEBUG: Afficher la structure de la rĂ©ponse + print(f"DEBUG - Structure de la rĂ©ponse: {result.keys()}") + + # VĂ©rifier les diffĂ©rentes clĂ©s possibles + if "response" in result: + return result["response"] + elif "message" in result: + return result["message"] + elif "content" in result: + return result["content"] + else: + # Si aucune clĂ© attendue n'est trouvĂ©e + print(f"DEBUG - RĂ©ponse complète: {result}") + return f"Erreur: Format de rĂ©ponse inattendu. ClĂ©s disponibles: {list(result.keys())}" + + except requests.exceptions.ConnectionError: + return "❌ Impossible de se connecter au serveur Ollama. VĂ©rifiez qu'Ollama est en cours d'exĂ©cution avec: ollama serve" + + except requests.exceptions.Timeout: + return "⚠️ La gĂ©nĂ©ration a pris trop de temps. Essayez avec un modèle plus petit." + + except requests.exceptions.HTTPError as e: + return f"❌ Erreur HTTP {response.status_code}: {e}" + + except Exception as e: + return f"❌ Erreur lors de la gĂ©nĂ©ration: {str(e)}" \ No newline at end of file diff --git a/src/impl/retriever.py b/src/impl/retriever.py new file mode 100644 index 0000000..671f939 --- /dev/null +++ b/src/impl/retriever.py @@ -0,0 +1,43 @@ +from typing import List +from ..interface.base_retriever import BaseRetriever +from ..interface.base_datastore import BaseDatastore +from sentence_transformers import CrossEncoder +import numpy as np + +class Retriever(BaseRetriever): + + def __init__(self, datastore: BaseDatastore): + self.datastore = datastore + self.model = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-6-v2') + + def search_retriever(self, query: str, top_k: int = 5) -> List[str]: + """Cherche dans le datastore et classe les rĂ©sultats par recherche sĂ©mantique + + Args: + query (str): RequĂŞte + top_k (int, optional): Nombre de rĂ©sultats Ă  retourner. Defaults to 5. + + Returns: + List[str]: Liste de content de docs classĂ©e + """ + search_results = self.datastore.search_datastore(query, top_k = top_k *5) + reranked_results = self._rerank(query, search_results, top_k) + return reranked_results + + + def _rerank(self, query: str, search_results: List[str], top_k: int=10) -> List[str]: + """Rerank le contenu des documents en fonction de la similaritĂ© avec la query + + Args: + query (str): requĂŞte + search_results (List[str]): liste de documents retrieved + top_k (int, optional): Nombre de documents Ă  retourner. Defaults to 10. + + Returns: + List[str]: Liste de documents classĂ©e + """ + pairs =[[query, doc]for doc in search_results] + scores = self.model.predict(pairs) + ranked_indices = np.argsort(scores)[::-1] + results = [search_results[idx] for idx in ranked_indices[:top_k]] + return results \ No newline at end of file diff --git a/src/interface/__init__.py b/src/interface/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/interface/__pycache__/__init__.cpython-312.pyc b/src/interface/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000..ffa7657 Binary files /dev/null and b/src/interface/__pycache__/__init__.cpython-312.pyc differ diff --git a/src/interface/__pycache__/base_datastore.cpython-312.pyc b/src/interface/__pycache__/base_datastore.cpython-312.pyc new file mode 100644 index 0000000..46b4b2e Binary files /dev/null and b/src/interface/__pycache__/base_datastore.cpython-312.pyc differ diff --git a/src/interface/__pycache__/base_indexer.cpython-312.pyc b/src/interface/__pycache__/base_indexer.cpython-312.pyc new file mode 100644 index 0000000..93019e8 Binary files /dev/null and b/src/interface/__pycache__/base_indexer.cpython-312.pyc differ diff --git a/src/interface/__pycache__/base_response_generator.cpython-312.pyc b/src/interface/__pycache__/base_response_generator.cpython-312.pyc new file mode 100644 index 0000000..c337a24 Binary files /dev/null and b/src/interface/__pycache__/base_response_generator.cpython-312.pyc differ diff --git a/src/interface/__pycache__/base_retriever.cpython-312.pyc b/src/interface/__pycache__/base_retriever.cpython-312.pyc new file mode 100644 index 0000000..fb40cc4 Binary files /dev/null and b/src/interface/__pycache__/base_retriever.cpython-312.pyc differ diff --git a/src/interface/base_datastore.py b/src/interface/base_datastore.py new file mode 100644 index 0000000..be6bd86 --- /dev/null +++ b/src/interface/base_datastore.py @@ -0,0 +1,22 @@ +from abc import ABC,abstractmethod +from typing import List +from pydantic import BaseModel + + +class DataItem(BaseModel): + content: str = "" + source: str = "" + +class BaseDatastore(ABC): + + @abstractmethod + def add_items(self, items: List[DataItem]) -> None: + pass + + @abstractmethod + def create_vector(self, content: str) -> List[float]: + pass + + @abstractmethod + def search_datastore(self, query: str, top_k: int=5) -> List[str]: + pass \ No newline at end of file diff --git a/src/interface/base_evaluator.py b/src/interface/base_evaluator.py new file mode 100644 index 0000000..513cfc9 --- /dev/null +++ b/src/interface/base_evaluator.py @@ -0,0 +1,17 @@ +from abc import ABC, abstractmethod +from typing import Optional +from pydantic import BaseModel + +class EvaluationResult(BaseModel): + question: str + response: str + expected_answer: str + is_correct: bool + reasoning: Optional[str] = None + +class BaseEvaluator(ABC): + + @abstractmethod + def evaluate(self, query: str, response: str, expected_answer: str) -> EvaluationResult: + pass + diff --git a/src/interface/base_indexer.py b/src/interface/base_indexer.py new file mode 100644 index 0000000..8b39aca --- /dev/null +++ b/src/interface/base_indexer.py @@ -0,0 +1,10 @@ +from abc import ABC, abstractmethod +from typing import List + +from src.interface.base_datastore import DataItem + +class BaseIndexer(ABC): + + @abstractmethod + def index(self, document_paths: List[str]) -> List[DataItem]: + pass \ No newline at end of file diff --git a/src/interface/base_response_generator.py b/src/interface/base_response_generator.py new file mode 100644 index 0000000..7c86acd --- /dev/null +++ b/src/interface/base_response_generator.py @@ -0,0 +1,8 @@ +from abc import ABC, abstractmethod +from typing import List + +class BaseResponseGenerator(ABC): + + @abstractmethod + def generate_response(self, query: str, context: List[str]) -> str: + pass \ No newline at end of file diff --git a/src/interface/base_retriever.py b/src/interface/base_retriever.py new file mode 100644 index 0000000..e7b52f2 --- /dev/null +++ b/src/interface/base_retriever.py @@ -0,0 +1,8 @@ +from abc import ABC, abstractmethod +from typing import List + +class BaseRetriever(ABC): + + @abstractmethod + def search_retriever(self, query: str, top_k: int = 5) -> List[str]: + pass \ No newline at end of file diff --git a/tests/__init__.py b/tests/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/__pycache__/__init__.cpython-312.pyc b/tests/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000..89c8923 Binary files /dev/null and b/tests/__pycache__/__init__.cpython-312.pyc differ diff --git a/tests/__pycache__/test_rag.cpython-312-pytest-8.4.2.pyc b/tests/__pycache__/test_rag.cpython-312-pytest-8.4.2.pyc new file mode 100644 index 0000000..a550830 Binary files /dev/null and b/tests/__pycache__/test_rag.cpython-312-pytest-8.4.2.pyc differ diff --git a/tests/test_rag.py b/tests/test_rag.py new file mode 100644 index 0000000..7eea44d --- /dev/null +++ b/tests/test_rag.py @@ -0,0 +1,423 @@ +""" +Suite de tests unitaires pour le système RAG (Indexer, Datastore, Retriever) + +AdaptĂ© Ă  votre implĂ©mentation spĂ©cifique avec: +- BaseDatastore (pas BaseDataStore) +- Datastore (pas DataStore) +- DataItem avec content/source (pas text/metadata) + +Pour exĂ©cuter: + pytest tests/test_rag.py -v + +Pour exĂ©cuter avec couverture: + pytest tests/test_rag.py --cov=src --cov-report=html +""" + +import sys +from pathlib import Path + +# Configuration du PYTHONPATH +project_root = Path(__file__).parent.parent +if str(project_root) not in sys.path: + sys.path.insert(0, str(project_root)) + +import pytest +import tempfile +import shutil +from typing import List + +# Imports adaptĂ©s Ă  votre structure +from src.interface.base_datastore import BaseDatastore, DataItem +from src.impl.datastore import Datastore + +# Tentative d'import des autres composants (adapter selon vos fichiers) +try: + from src.interface.base_indexer import BaseIndexer + from src.impl.indexer import Indexer + HAS_INDEXER = True +except ImportError: + HAS_INDEXER = False + print("⚠️ Indexer non trouvĂ© - tests d'indexation dĂ©sactivĂ©s") + +try: + from src.interface.base_retriever import BaseRetriever + from src.impl.retriever import Retriever + HAS_RETRIEVER = True +except ImportError: + HAS_RETRIEVER = False + print("⚠️ Retriever non trouvĂ© - tests de rĂ©cupĂ©ration dĂ©sactivĂ©s") + + +# ============================================================================ +# FIXTURES - Configuration des tests +# ============================================================================ + +@pytest.fixture +def temp_dir(): + """CrĂ©e un rĂ©pertoire temporaire pour les tests.""" + temp_path = tempfile.mkdtemp() + yield temp_path + shutil.rmtree(temp_path, ignore_errors=True) + + +@pytest.fixture +def sample_items(): + """CrĂ©e des DataItems de test.""" + return [ + DataItem( + content="L'intelligence artificielle (IA) est un domaine de l'informatique.", + source="doc1.pdf" + ), + DataItem( + content="Python est un langage de programmation populaire pour l'IA.", + source="doc2.pdf" + ), + DataItem( + content="Les rĂ©seaux de neurones sont utilisĂ©s en deep learning.", + source="doc3.pdf" + ), + DataItem( + content="Le machine learning permet aux ordinateurs d'apprendre sans ĂŞtre explicitement programmĂ©s.", + source="doc4.pdf" + ), + ] + + +@pytest.fixture +def datastore(temp_dir, monkeypatch): + """CrĂ©e une instance du Datastore avec une base temporaire.""" + # Modifier le chemin de la DB pour les tests + test_db_path = str(Path(temp_dir) / "test-lancedb") + monkeypatch.setattr("src.impl.datastore.Datastore.DB_PATH", test_db_path) + + ds = Datastore() + ds.reset_table() # S'assurer que la table est vide + yield ds + + # Cleanup + try: + ds.vector_db.drop_table(ds.DB_TABLE_NAME) + except: + pass + + +@pytest.fixture +def populated_datastore(datastore, sample_items): + """Datastore prĂ©-rempli avec des items.""" + datastore.add_items(sample_items) + return datastore + + +@pytest.fixture +def indexer(): + """CrĂ©e une instance de l'Indexer si disponible.""" + if not HAS_INDEXER: + pytest.skip("Indexer non disponible") + return Indexer() + + +@pytest.fixture +def retriever(datastore): + """CrĂ©e une instance du Retriever si disponible.""" + if not HAS_RETRIEVER: + pytest.skip("Retriever non disponible") + return Retriever(datastore=datastore) + + +# ============================================================================ +# TESTS DATAITEM +# ============================================================================ + +class TestDataItem: + """Tests pour la classe DataItem.""" + + def test_dataitem_creation(self): + """Test la crĂ©ation d'un DataItem.""" + item = DataItem(content="Test content", source="test.pdf") + assert item.content == "Test content" + assert item.source == "test.pdf" + + def test_dataitem_default_values(self): + """Test les valeurs par dĂ©faut.""" + item = DataItem() + assert item.content == "" + assert item.source == "" + + def test_dataitem_validation(self): + """Test la validation Pydantic.""" + # Pydantic devrait accepter ces types + item = DataItem(content="text", source="source.pdf") + assert isinstance(item.content, str) + assert isinstance(item.source, str) + + +# ============================================================================ +# TESTS DATASTORE +# ============================================================================ + +class TestDatastore: + """Tests pour la classe Datastore.""" + + def test_datastore_initialization(self, datastore): + """Test l'initialisation du Datastore.""" + assert isinstance(datastore, BaseDatastore) + assert datastore.vector_dimensions == 384 + assert datastore.model is not None + assert datastore.table is not None + + def test_reset_table(self, datastore): + """Test le reset de la table.""" + # Ajouter des items + items = [DataItem(content="test", source="test.pdf")] + datastore.add_items(items) + + # Reset + table = datastore.reset_table() + assert table is not None + + # VĂ©rifier que la table est vide + results = datastore.search_datastore("test", top_k=10) + assert len(results) == 0 + + def test_add_single_item(self, datastore): + """Test l'ajout d'un seul item.""" + item = DataItem( + content="Test de l'ajout d'un item unique", + source="test_single.pdf" + ) + datastore.add_items([item]) + + # VĂ©rifier que l'item peut ĂŞtre retrouvĂ© + results = datastore.search_datastore("test ajout", top_k=5) + assert len(results) > 0 + + def test_add_multiple_items(self, datastore, sample_items): + """Test l'ajout de plusieurs items.""" + datastore.add_items(sample_items) + + # VĂ©rifier que plusieurs items ont Ă©tĂ© ajoutĂ©s + results = datastore.search_datastore("intelligence", top_k=10) + assert len(results) > 0 + + def test_add_empty_list(self, datastore): + """Test l'ajout d'une liste vide.""" + # Ne devrait pas crasher + datastore.add_items([]) + # Pas d'exception = succès + + def test_create_vector(self, datastore): + """Test la crĂ©ation de vecteurs.""" + vector = datastore.create_vector("Test content") + + assert isinstance(vector, list) + assert len(vector) == 384 # Dimension du modèle all-MiniLM-L6-v2 + assert all(isinstance(v, float) for v in vector) + + def test_create_vector_consistency(self, datastore): + """Test que le mĂŞme texte produit le mĂŞme vecteur.""" + text = "Texte de test pour la cohĂ©rence" + vector1 = datastore.create_vector(text) + vector2 = datastore.create_vector(text) + + # Les vecteurs devraient ĂŞtre identiques (ou très proches) + import numpy as np + similarity = np.dot(vector1, vector2) / (np.linalg.norm(vector1) * np.linalg.norm(vector2)) + assert similarity > 0.99 # Très haute similaritĂ© + + def test_search_basic(self, populated_datastore): + """Test une recherche basique.""" + results = populated_datastore.search_datastore("intelligence artificielle", top_k=3) + + assert len(results) > 0 + assert len(results) <= 3 + assert all(isinstance(r, str) for r in results) + + def test_search_relevance(self, populated_datastore): + """Test la pertinence des rĂ©sultats.""" + results = populated_datastore.search_datastore("Python programmation", top_k=5) + + assert len(results) > 0 + # Le premier rĂ©sultat devrait contenir "Python" + assert any("Python" in results[0] or "python" in results[0].lower() + for _ in [results[0]]) + + def test_search_top_k_limit(self, populated_datastore): + """Test que top_k limite correctement les rĂ©sultats.""" + results_2 = populated_datastore.search_datastore("test", top_k=2) + results_4 = populated_datastore.search_datastore("test", top_k=4) + + assert len(results_2) <= 2 + assert len(results_4) <= 4 + + def test_search_empty_query(self, populated_datastore): + """Test avec une requĂŞte vide.""" + results = populated_datastore.search_datastore("", top_k=5) + # Devrait retourner des rĂ©sultats ou liste vide, pas d'erreur + assert isinstance(results, list) + + def test_search_no_results(self, datastore): + """Test une recherche sur une base vide.""" + results = datastore.search_datastore("query inexistante", top_k=5) + assert results == [] + + def test_search_special_characters(self, populated_datastore): + """Test avec des caractères spĂ©ciaux.""" + results = populated_datastore.search_datastore("l'intelligence", top_k=5) + assert isinstance(results, list) + + def test_multiple_searches_consistency(self, populated_datastore): + """Test la cohĂ©rence sur plusieurs recherches.""" + query = "intelligence artificielle" + results1 = populated_datastore.search_datastore(query, top_k=3) + results2 = populated_datastore.search_datastore(query, top_k=3) + + # Les rĂ©sultats devraient ĂŞtre identiques + assert results1 == results2 + + def test_large_batch_add(self, datastore): + """Test l'ajout d'un grand nombre d'items.""" + large_batch = [ + DataItem(content=f"Document numĂ©ro {i} avec du contenu variĂ©.", source=f"doc_{i}.pdf") + for i in range(100) + ] + + # Ne devrait pas crasher + datastore.add_items(large_batch) + + # VĂ©rifier que des rĂ©sultats sont retournĂ©s + results = datastore.search_datastore("document", top_k=10) + assert len(results) > 0 + + +# ============================================================================ +# TESTS INDEXER (si disponible) +# ============================================================================ + +@pytest.mark.skipif(not HAS_INDEXER, reason="Indexer non disponible") +class TestIndexer: + """Tests pour la classe Indexer.""" + + def test_indexer_initialization(self, indexer): + """Test l'initialisation de l'Indexer.""" + assert isinstance(indexer, BaseIndexer) + + def test_index_documents(self, indexer, temp_dir): + """Test l'indexation de documents.""" + # CrĂ©er un document de test + doc_path = Path(temp_dir) / "test_doc.pdf" + doc_path.write_text("Contenu de test pour l'indexation.") + + items = indexer.index([str(doc_path)]) + + assert len(items) > 0 + assert all(isinstance(item, DataItem) for item in items) + assert all(hasattr(item, 'content') for item in items) + assert all(hasattr(item, 'source') for item in items) + + +# ============================================================================ +# TESTS RETRIEVER (si disponible) +# ============================================================================ + +@pytest.mark.skipif(not HAS_RETRIEVER, reason="Retriever non disponible") +class TestRetriever: + """Tests pour la classe Retriever.""" + + def test_retriever_initialization(self, retriever): + """Test l'initialisation du Retriever.""" + assert isinstance(retriever, BaseRetriever) + assert retriever.datastore is not None + + def test_retrieve_basic(self, retriever, populated_datastore): + """Test une rĂ©cupĂ©ration basique.""" + retriever.datastore = populated_datastore + results = retriever.search_retriever("intelligence artificielle", top_k=3) + + assert len(results) > 0 + assert len(results) <= 3 + + +# ============================================================================ +# TESTS D'INTÉGRATION +# ============================================================================ + +class TestIntegration: + """Tests d'intĂ©gration du système complet.""" + + @pytest.mark.skipif(not HAS_INDEXER, reason="Indexer requis") + def test_full_pipeline(self, indexer, datastore, temp_dir): + """Test du pipeline complet: indexation → stockage → recherche.""" + # 1. CrĂ©er un document + doc_path = Path(temp_dir) / "integration_test.pdf" + doc_path.write_text(""" + L'intelligence artificielle transforme le monde. + Python est le langage privilĂ©giĂ© pour l'IA. + Les algorithmes de machine learning sont puissants. + """) + + # 2. Indexation + items = indexer.index([str(doc_path)]) + assert len(items) > 0 + + # 3. Stockage + datastore.add_items(items) + + # 4. Recherche + results = datastore.search_datastore("intelligence artificielle Python", top_k=3) + assert len(results) > 0 + + # 5. VĂ©rifier la pertinence + top_result = results[0].lower() + assert "intelligence" in top_result or "python" in top_result + + def test_incremental_addition(self, datastore, sample_items): + """Test l'ajout incrĂ©mental de donnĂ©es.""" + # Ajouter en plusieurs fois + datastore.add_items(sample_items[:2]) + results_1 = datastore.search_datastore("test", top_k=10) + + datastore.add_items(sample_items[2:]) + results_2 = datastore.search_datastore("test", top_k=10) + + # Le second devrait avoir plus de rĂ©sultats potentiels + assert len(results_2) >= len(results_1) + + +# ============================================================================ +# TESTS DE ROBUSTESSE +# ============================================================================ + +class TestRobustness: + """Tests de robustesse et gestion d'erreurs.""" + + def test_unicode_content(self, datastore): + """Test avec du contenu Unicode.""" + items = [ + DataItem(content="Émojis: 🎉 🎨 🚀", source="unicode.pdf"), + DataItem(content="Caractères spĂ©ciaux: Ă© Ă  ç ñ", source="special.pdf"), + ] + + datastore.add_items(items) + results = datastore.search_datastore("Ă©mojis caractères", top_k=5) + assert isinstance(results, list) + + def test_very_long_content(self, datastore): + """Test avec du contenu très long.""" + long_content = "Test. " * 1000 # ~6000 caractères + item = DataItem(content=long_content, source="long.pdf") + + # Ne devrait pas crasher + datastore.add_items([item]) + results = datastore.search_datastore("test", top_k=1) + assert len(results) >= 0 + + def test_empty_content(self, datastore): + """Test avec du contenu vide.""" + items = [DataItem(content="", source="empty.pdf")] + + # Ne devrait pas crasher + datastore.add_items(items) + + +if __name__ == "__main__": + pytest.main([__file__, "-v", "--tb=short"]) \ No newline at end of file