Initial commit

2025-11-03 18:20:12 +01:00
commit 98f6a7b3c0
108 changed files with 987 additions and 0 deletions

BIN
.coverage Normal file

Binary file not shown.

Binary file not shown.

Binary file not shown.

View File

@@ -0,0 +1 @@
(LanceDB binary data file, not displayable as text; recoverable schema: vector fixed_size_list<float>[384], content: string, source: string)

Binary file not shown.

BIN
data/source/database.pdf Normal file

Binary file not shown.

BIN
data/source/employes.pdf Normal file

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

47
main.py Normal file

File diff suppressed because one or more lines are too long

16
requirements.txt Normal file
View File

@@ -0,0 +1,16 @@
# run: pip install -r requirements.txt
numpy<2
pandas>=2.1.4,<3.0
scikit-learn
--extra-index-url https://download.pytorch.org/whl/cpu
torch  # CPU-only wheels come from the extra index above
transformers
accelerate>=1.2.1,<2.0.0
sentence-transformers
pyarrow==14.0.1
# Default Dependencies
pydantic>=2.0.0  # For data validation
lancedb==0.6.13
docling==2.31.0
cohere==5.15.0

0
src/__init__.py Normal file
View File

Binary file not shown.

0
src/impl/__init__.py Normal file
View File

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

163
src/impl/datastore.py Normal file
View File

@@ -0,0 +1,163 @@
from typing import List

import lancedb
import pyarrow as pa
from lancedb.table import Table
from sentence_transformers import SentenceTransformer

from ..interface.base_datastore import BaseDatastore, DataItem

# from concurrent.futures import ThreadPoolExecutor


class Datastore(BaseDatastore):
    DB_PATH = "data/sample-lancedb"
    DB_TABLE_NAME = "rag-table"

    def __init__(self):
        """Default constructor: sets the embedding vector dimensions
        (currently 384, the fixed dimension of all-MiniLM-L6-v2), loads the
        SentenceTransformer model, connects to the database and opens the table.
        The model's maximum sequence length is 256 tokens.
        """
        self.vector_dimensions = 384  # all-MiniLM-L6-v2 has a fixed dimension of 384
        self.model = SentenceTransformer("all-MiniLM-L6-v2")
        self.vector_db = lancedb.connect(self.DB_PATH)
        self.table: Table = self._get_table()

    def reset_table(self) -> Table:
        """Drop the table if it exists, then create and open a table with the schema:
        vector (float32 list whose dimension is defined on the class), content and source.

        Returns:
            Table: the created table
        """
        try:
            self.vector_db.drop_table(self.DB_TABLE_NAME)
        except Exception:
            print("Unable to drop the table, assuming it does not exist.")
        schema = pa.schema(
            [
                pa.field("vector", pa.list_(pa.float32(), self.vector_dimensions)),
                pa.field("content", pa.utf8()),
                pa.field("source", pa.utf8()),
            ]
        )
        self.table = self.vector_db.create_table(self.DB_TABLE_NAME, schema=schema)
        # self.table = self.vector_db.open_table(self.DB_TABLE_NAME)
        print(f"Table was reset/created: {self.DB_TABLE_NAME} in {self.DB_PATH}")
        return self.table

    def _get_table(self) -> Table:
        """Open the table, or reset it if opening fails.

        Returns:
            Table: the opened table
        """
        try:
            return self.vector_db.open_table(self.DB_TABLE_NAME)
        except Exception as e:
            print(f"Error opening the table {e}. Trying to reset it.")
            return self.reset_table()

    def add_items(self, items: List[DataItem]) -> None:
        """Add the given items to the datastore (requires embedding them first).
        This is a network-bound operation, so it is a candidate for parallelization.

        Args:
            items (List[DataItem]): list of DataItems to add
        """
        if not items:
            return
        contents = [item.content for item in items]
        sources = [item.source for item in items]
        # Embed the content of each entry in batches of 32.
        print(f"Generating embeddings for {len(items)} items...")
        vectors = self.model.encode(
            contents,
            batch_size=32,
        )
        # Convert to dictionaries to store the documents in the database.
        entries = [
            {
                "vector": vector,
                "content": content,
                "source": source,
            }
            for vector, content, source in zip(vectors, contents, sources)
        ]
        # self.table.merge_insert("source").when_matched_update_all().when_not_matched_insert_all().execute(entries)
        self.table.add(entries)
        print(f"{len(entries)} items added")

    # deprecated
    def _convert_items_to_entry(self, item: DataItem) -> dict:
        """Convert a DataItem into a dictionary matching the Datastore schema.

        Args:
            item (DataItem): item to convert

        Returns:
            dict: dictionary containing the vector, the content and the source
        """
        vector = self.create_vector(item.content)
        return {
            "vector": vector,
            "content": item.content,
            "source": item.source,
        }

    def create_vector(self, content: str) -> List[float]:
        """Use the embedding model to convert the string into a vector (List[float]).

        Args:
            content (str): content of the entry to vectorize

        Returns:
            List[float]: the resulting vector
        """
        response = self.model.encode(content)
        return response.tolist()

    def search_datastore(self, query: str, top_k: int = 5) -> List[str]:
        """Embed the query and run a vector search.

        Args:
            query (str): the query
            top_k (int, optional): maximum number of documents to return. Defaults to 5.

        Returns:
            List[str]: list containing the content of the retrieved documents
        """
        vector = self.model.encode(query)
        results = (
            self.table
            .search(vector, vector_column_name="vector")
            .select(["content", "source"])
            .limit(top_k)
            .to_list()
        )
        result_content = [
            result["content"]
            for result in results
            if "content" in result and result["content"] is not None
        ]
        return result_content
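
A minimal usage sketch for this datastore (assuming it is run from the project root with the package layout of this commit; the sample text is illustrative):

from src.impl.datastore import Datastore
from src.interface.base_datastore import DataItem

datastore = Datastore()
datastore.reset_table()
datastore.add_items([
    DataItem(content="LanceDB stores vectors on disk.", source="notes.md:chunk 0"),
])
print(datastore.search_datastore("Where are vectors stored?", top_k=1))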

45
src/impl/evaluator.py Normal file
View File

@@ -0,0 +1,45 @@
from ..interface.base_evaluator import EvaluationResult, BaseEvaluator
import requests


class Evaluator(BaseEvaluator):
    def __init__(self, model_name: str = "llama3.2:3b", base_url: str = "http://localhost:11434"):
        self.base_url = base_url
        self.model_name = model_name

    SYSTEM_PROMPT = """
    You are a system that evaluates the correctness of a response to a question.
    The question will be provided in <question>...</question> tags.
    The response will be provided in <response>...</response> tags.
    The expected answer will be provided in <expected_answer>...</expected_answer> tags.
    The response doesn't have to exactly match all the words/context of the expected answer. It just needs to be right about
    the answer to the actual question itself.
    Evaluate whether the response is correct or not, and return your reasoning in <reasoning>...</reasoning> tags.
    Then return the result in <result>...</result> tags — either as 'true' or 'false'.
    """

    def evaluate(self, query: str, response: str, expected_answer: str) -> EvaluationResult:
        user_prompt = f"""
        <question> \n{query} </question>
        <response> \n{response} </response>
        <expected_answer> \n{expected_answer} </expected_answer>
        """
        api_response = requests.post(
            f"{self.base_url}/api/generate",
            json={
                "model": self.model_name,
                "system": self.SYSTEM_PROMPT,  # send the evaluation rubric as the system prompt
                "prompt": user_prompt,
                "stream": False,
                "options": {
                    "temperature": 0.7,
                    "top_p": 0.9,
                },
            },
        )
        api_response.raise_for_status()
        content = api_response.json().get("response", "")
        # Pull the 'true'/'false' verdict out of the <result>...</result> tags.
        is_correct = "true" in content.split("<result>")[-1].split("</result>")[0].lower()
        # NOTE: field names are assumed here; EvaluationResult's definition is not shown in this diff.
        return EvaluationResult(is_correct=is_correct, reasoning=content)
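
A hedged usage sketch (assumes a local Ollama server is running, and that EvaluationResult exposes the fields assumed above, since base_evaluator is not shown in this diff):

from src.impl.evaluator import Evaluator

evaluator = Evaluator()
result = evaluator.evaluate(
    query="Which vector database does the project use?",
    response="The project uses LanceDB.",
    expected_answer="LanceDB",
)
print(result)  # hypothetical fields: is_correct, reasoning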

101
src/impl/indexer.py Normal file
View File

@@ -0,0 +1,101 @@
from typing import List
from src.interface.base_datastore import DataItem
from src.interface.base_indexer import BaseIndexer
from docling.document_converter import DocumentConverter
from docling_core.transforms.chunker.hybrid_chunker import HybridChunker
import os
from docling_core.transforms.chunker.tokenizer.huggingface import HuggingFaceTokenizer
from transformers import AutoTokenizer
from langchain_text_splitters import RecursiveCharacterTextSplitter
# pip install langchain langchain-text-splitters

EMBED_MODEL_ID = "sentence-transformers/all-MiniLM-L6-v2"
MAX_TOKENS = 256  # set to a small number for illustrative purposes
# previously 512


class Indexer(BaseIndexer):
    def __init__(self):
        self.converter = DocumentConverter()
        self.tokenizer = HuggingFaceTokenizer(
            tokenizer=AutoTokenizer.from_pretrained(EMBED_MODEL_ID),
            max_tokens=MAX_TOKENS,  # optional, by default derived from `tokenizer` for HF case
        )
        self.chunker = HybridChunker(
            tokenizer=self.tokenizer,
            max_tokens=MAX_TOKENS,
            # merge_peers = True,
            # handle_tables = "separate",
            # handle_pictures = "separate"
        )
        """self.text_splitter = RecursiveCharacterTextSplitter.from_huggingface_tokenizer(
            tokenizer = self.tokenizer,
            chunk_size = MAX_TOKENS,
            chunk_overlap = MAX_TOKENS // 10,
            separators=["\n\n", "\n", ". ", " ", ""],
        )"""

    def index(self, document_paths: List[str]) -> List[DataItem]:
        """Convert the documents to the docling format, then split them into chunks.
        The chunks are then turned into DataItems, with metadata added.

        Args:
            document_paths (List[str]): list of documents to index

        Returns:
            List[DataItem]: list of indexed DataItems
        """
        items = []
        for document_path in document_paths:
            try:
                document = self.converter.convert(document_path).document
                chunks = list(self.chunker.chunk(document))
                # chunks = self.text_splitter.split_text(document)
                item = self._convert_to_DataItem(chunks, document_path)
                items.extend(item)
            except Exception as e:
                print(f"Error while processing {document_path}: {e}")
                continue
        return items

    def _convert_to_DataItem(self, chunks, document_path: str) -> List[DataItem]:
        """Build a list of DataItems, prepending any headings to the content
        and deriving the source from the file name via the os module.

        Args:
            chunks: list of document chunks to process
            document_path: path to the document, used with os to get the file name directly

        Returns:
            List[DataItem]: list of DataItems carrying the metadata, with content and source kept separate
        """
        items = []
        for i, chunk in enumerate(chunks):
            try:
                headings = ""
                if hasattr(chunk, 'meta') and hasattr(chunk.meta, 'headings') and chunk.meta.headings:
                    headings = "## " + ", ".join(chunk.meta.headings) + "\n"
                text = chunk.text if hasattr(chunk, 'text') else str(chunk)
                content = f"{headings}{text}"
                filename = os.path.basename(document_path)
                source = f"{filename}:chunk {i}"
                item = DataItem(content=content, source=source)
                items.append(item)
            except Exception as e:
                print(f"Error on chunk {i}: {e}")
                continue
        return items
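
A short sketch of how this indexer feeds the datastore defined above (the PDF paths match the data/source files added in this commit):

from src.impl.indexer import Indexer
from src.impl.datastore import Datastore

indexer = Indexer()
items = indexer.index(["data/source/database.pdf", "data/source/employes.pdf"])
datastore = Datastore()
datastore.reset_table()
datastore.add_items(items)  # embeds and stores every chunk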

83
src/impl/response_generator.py Normal file

View File

@@ -0,0 +1,83 @@
from typing import List
from ..interface.base_response_generator import BaseResponseGenerator
import requests

SYSTEM_PROMPT = """Tu es un assistant intelligent qui répond aux questions en te basant sur le contexte fourni.
Règles importantes:
- Réponds UNIQUEMENT en te basant sur les informations du contexte
- Si l'information n'est pas dans le contexte, dis clairement "Je ne trouve pas cette information dans les documents fournis"
- Cite les sources quand c'est pertinent
- Réponds en français de manière claire et concise
- Ne réponds pas avec "Selon le document" mais donne directement l'information"""


class ResponseGenerator(BaseResponseGenerator):
    def __init__(self, model_name: str = "llama3.2:3b", base_url: str = "http://localhost:11434"):
        self.model_name = model_name
        self.base_url = base_url

    def generate_response(self, query: str, context: List[str]) -> str:
        """Generate a response based on the query and the context."""
        # Format the context
        formatted_context = "\n\n".join([f"Document {i+1}:\n{doc}" for i, doc in enumerate(context)])
        # Build the prompt
        prompt = f"""Instructions: {SYSTEM_PROMPT}
Contexte: {formatted_context}
Question: {query}
Réponse:"""
        # Call Ollama through its HTTP API
        try:
            response = requests.post(
                f"{self.base_url}/api/generate",
                json={
                    "model": self.model_name,
                    "prompt": prompt,
                    "stream": False,
                    "options": {
                        "temperature": 0.7,
                        "top_p": 0.9,
                    },
                },
                timeout=120,  # without a timeout, the Timeout handler below can never trigger
            )
            # Check the response status
            response.raise_for_status()
            # Parse the JSON
            result = response.json()
            # DEBUG: print the structure of the response
            print(f"DEBUG - Response structure: {result.keys()}")
            # Check the possible response keys
            if "response" in result:
                return result["response"]
            elif "message" in result:
                return result["message"]
            elif "content" in result:
                return result["content"]
            else:
                # No expected key was found
                print(f"DEBUG - Full response: {result}")
                return f"Erreur: Format de réponse inattendu. Clés disponibles: {list(result.keys())}"
        except requests.exceptions.ConnectionError:
            return "❌ Impossible de se connecter au serveur Ollama. Vérifiez qu'Ollama est en cours d'exécution avec: ollama serve"
        except requests.exceptions.Timeout:
            return "⚠️ La génération a pris trop de temps. Essayez avec un modèle plus petit."
        except requests.exceptions.HTTPError as e:
            return f"❌ Erreur HTTP {response.status_code}: {e}"
        except Exception as e:
            return f"❌ Erreur lors de la génération: {str(e)}"

43
src/impl/retriever.py Normal file
View File

@@ -0,0 +1,43 @@
from typing import List
from ..interface.base_retriever import BaseRetriever
from ..interface.base_datastore import BaseDatastore
from sentence_transformers import CrossEncoder
import numpy as np


class Retriever(BaseRetriever):
    def __init__(self, datastore: BaseDatastore):
        self.datastore = datastore
        self.model = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-6-v2')

    def search_retriever(self, query: str, top_k: int = 5) -> List[str]:
        """Search the datastore and rank the results by semantic relevance.

        Args:
            query (str): the query
            top_k (int, optional): number of results to return. Defaults to 5.

        Returns:
            List[str]: ranked list of document contents
        """
        search_results = self.datastore.search_datastore(query, top_k=top_k * 5)
        reranked_results = self._rerank(query, search_results, top_k)
        return reranked_results

    def _rerank(self, query: str, search_results: List[str], top_k: int = 10) -> List[str]:
        """Rerank the document contents by their similarity to the query.

        Args:
            query (str): the query
            search_results (List[str]): list of retrieved documents
            top_k (int, optional): number of documents to return. Defaults to 10.

        Returns:
            List[str]: ranked list of documents
        """
        pairs = [[query, doc] for doc in search_results]
        scores = self.model.predict(pairs)
        ranked_indices = np.argsort(scores)[::-1]
        results = [search_results[idx] for idx in ranked_indices[:top_k]]
        return results
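
Putting the retrieval pieces together, a hedged end-to-end sketch (the retriever fetches top_k * 5 candidates from the datastore, then keeps the cross-encoder's top_k; the query is illustrative):

from src.impl.datastore import Datastore
from src.impl.retriever import Retriever

retriever = Retriever(datastore=Datastore())
for doc in retriever.search_retriever("effectifs de l'entreprise", top_k=3):
    print(doc[:80])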

View File

Binary file not shown.

Binary file not shown.

22
src/interface/base_datastore.py Normal file

View File

@@ -0,0 +1,22 @@
from abc import ABC, abstractmethod
from typing import List
from pydantic import BaseModel


class DataItem(BaseModel):
    content: str = ""
    source: str = ""


class BaseDatastore(ABC):
    @abstractmethod
    def add_items(self, items: List[DataItem]) -> None:
        pass

    @abstractmethod
    def create_vector(self, content: str) -> List[float]:
        pass

    @abstractmethod
    def search_datastore(self, query: str, top_k: int = 5) -> List[str]:
        pass
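
Since BaseDatastore is an ABC, a concrete store must implement all three abstract methods before it can be instantiated; a minimal in-memory sketch, purely illustrative:

from typing import List

class InMemoryDatastore(BaseDatastore):
    def __init__(self):
        self.items: List[DataItem] = []

    def add_items(self, items: List[DataItem]) -> None:
        self.items.extend(items)

    def create_vector(self, content: str) -> List[float]:
        return [float(len(content))]  # placeholder embedding, illustrative only

    def search_datastore(self, query: str, top_k: int = 5) -> List[str]:
        return [item.content for item in self.items[:top_k]]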

Some files were not shown because too many files have changed in this diff.