Initial commit
0
src/__init__.py
Normal file
BIN
src/__pycache__/__init__.cpython-312.pyc
Normal file
Binary file not shown.
0
src/impl/__init__.py
Normal file
BIN
src/impl/__pycache__/__init__.cpython-312.pyc
Normal file
Binary file not shown.
BIN
src/impl/__pycache__/datastore.cpython-312.pyc
Normal file
Binary file not shown.
BIN
src/impl/__pycache__/indexer.cpython-312.pyc
Normal file
Binary file not shown.
BIN
src/impl/__pycache__/response_generator.cpython-312.pyc
Normal file
Binary file not shown.
BIN
src/impl/__pycache__/retriever.cpython-312.pyc
Normal file
Binary file not shown.
163
src/impl/datastore.py
Normal file
@@ -0,0 +1,163 @@
from typing import List
from ..interface.base_datastore import BaseDatastore, DataItem
import lancedb
from lancedb.table import Table
import pyarrow as pa
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
# from concurrent.futures import ThreadPoolExecutor


class Datastore(BaseDatastore):

    DB_PATH = "data/sample-lancedb"
    DB_TABLE_NAME = "rag-table"

    def __init__(self):
        """Default constructor: sets the embedding vector dimensions
        (currently 384, the default for the all-MiniLM-L6-v2 model), loads the
        SentenceTransformer model, connects to the database and opens the table.
        The model's maximum sequence length is 256.
        """
        self.vector_dimensions = 384  # all-MiniLM-L6-v2 has a fixed dimension of 384
        self.model = SentenceTransformer("all-MiniLM-L6-v2")
        self.vector_db = lancedb.connect(self.DB_PATH)
        self.table: Table = self._get_table()

    def reset_table(self) -> Table:
        """Drop the table if it exists, then create a table with the schema
        vector (list of float32 with the dimension defined on the class), content and source,
        and open it.

        Returns:
            Table: the created table
        """
        try:
            self.vector_db.drop_table(self.DB_TABLE_NAME)
        except Exception:
            print("Unable to drop the table, assuming it does not exist.")

        schema = pa.schema(
            [
                pa.field("vector", pa.list_(pa.float32(), self.vector_dimensions)),
                pa.field("content", pa.utf8()),
                pa.field("source", pa.utf8()),
            ]
        )

        self.table = self.vector_db.create_table(self.DB_TABLE_NAME, schema=schema)
        # self.table = self.vector_db.open_table(self.DB_TABLE_NAME)
        print(f"Table was reset/created: {self.DB_TABLE_NAME} in {self.DB_PATH}")
        return self.table

    def _get_table(self) -> Table:
        """Open the table, or reset it if opening fails.

        Returns:
            Table: the opened table
        """
        try:
            return self.vector_db.open_table(self.DB_TABLE_NAME)
        except Exception as e:
            print(f"Error opening the table {e}. Trying to reset it.")
            return self.reset_table()

    def add_items(self, items: List[DataItem]) -> None:
        """Add the given items to the datastore (each item needs an embedding).
        This is a network-bound operation and a candidate for parallelization.

        Args:
            items (List[DataItem]): list of DataItems to add
        """
        if not items:
            return

        contents = [item.content for item in items]
        sources = [item.source for item in items]

        # Embed the content of each entry in batches of 32.
        print(f"Generating embeddings for {len(items)} items...")
        vectors = self.model.encode(
            contents,
            batch_size=32,
        )

        # Convert to dictionaries so the documents can be stored in the database.
        entries = [
            {
                "vector": vector,
                "content": content,
                "source": source,
            }
            for vector, content, source in zip(vectors, contents, sources)
        ]

        # self.table.merge_insert("source").when_matched_update_all().when_not_matched_insert_all().execute(entries)
        self.table.add(entries)
        print(f"{len(entries)} items added")

    # deprecated
    def _convert_items_to_entry(self, item: DataItem) -> dict:
        """Convert a DataItem into a dictionary matching the Datastore schema.

        Args:
            item (DataItem): item to convert

        Returns:
            dict: dictionary containing the vector, the content and the source
        """
        vector = self.create_vector(item.content)

        return {
            "vector": vector,
            "content": item.content,
            "source": item.source,
        }

    def create_vector(self, content: str) -> List[float]:
        """Use the embedding model to convert the string into a vector (List[float]).

        Args:
            content (str): content of the entry to vectorize

        Returns:
            List[float]: the resulting vector
        """
        response = self.model.encode(content)
        return response.tolist()

    def search_datastore(self, query: str, top_k: int = 5) -> List[str]:
        """Embed the query and run a vector search.

        Args:
            query (str): the query
            top_k (int, optional): maximum number of documents to return. Defaults to 5.

        Returns:
            List[str]: list containing the content of the retrieved documents
        """
        vector = self.model.encode(query)
        results = (
            self.table
            .search(vector, vector_column_name="vector")
            .select(["content", "source"])
            .limit(top_k)
            .to_list()
        )

        result_content = [
            result["content"]
            for result in results
            if "content" in result and result["content"] is not None
        ]
        return result_content
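A rough usage sketch for the class above (not part of this commit; the sample items below are invented for illustration):

from src.impl.datastore import Datastore
from src.interface.base_datastore import DataItem

datastore = Datastore()
datastore.reset_table()  # start from an empty "rag-table"

# Hypothetical sample items; in the real pipeline they come from the Indexer.
items = [
    DataItem(content="LanceDB is an embedded vector database.", source="notes.md:chunk 0"),
    DataItem(content="all-MiniLM-L6-v2 produces 384-dimensional embeddings.", source="notes.md:chunk 1"),
]
datastore.add_items(items)
print(datastore.search_datastore("Which embedding dimension is used?", top_k=2))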
45
src/impl/evaluator.py
Normal file
@@ -0,0 +1,45 @@
from ..interface.base_evaluator import EvaluationResult, BaseEvaluator
import requests
import re


class Evaluator(BaseEvaluator):

    def __init__(self, model_name: str = "llama3.2:3b", base_url: str = "http://localhost:11434"):
        self.base_url = base_url
        self.model_name = model_name

    SYSTEM_PROMPT = """
    You are a system that evaluates the correctness of a response to a question.
    The question will be provided in <question>...</question> tags.
    The response will be provided in <response>...</response> tags.
    The expected answer will be provided in <expected_answer>...</expected_answer> tags.

    The response doesn't have to exactly match all the words/context of the expected answer. It just needs to be right about
    the answer to the actual question itself.

    Evaluate whether the response is correct or not, and return your reasoning in <reasoning>...</reasoning> tags.
    Then return the result in <result>...</result> tags — either as 'true' or 'false'.
    """

    def evaluate(self, query: str, response: str, expected_answer: str) -> EvaluationResult:
        user_prompt = f"""
        <question> \n{query} </question>
        <response> \n{response} </response>
        <expected_answer> \n{expected_answer} </expected_answer>
        """

        api_response = requests.post(
            f"{self.base_url}/api/generate",
            json={
                "model": self.model_name,
                "system": self.SYSTEM_PROMPT,
                "prompt": user_prompt,
                "stream": False,
                "options": {
                    "temperature": 0.7,
                    "top_p": 0.9,
                },
            },
        )
        api_response.raise_for_status()
        response_content = api_response.json().get("response", "")

        # Pull the <reasoning> and <result> tags out of the model output.
        reasoning_match = re.search(r"<reasoning>(.*?)</reasoning>", response_content, re.DOTALL)
        result_match = re.search(r"<result>(.*?)</result>", response_content, re.DOTALL)
        reasoning = reasoning_match.group(1).strip() if reasoning_match else None
        is_correct = bool(result_match) and result_match.group(1).strip().lower() == "true"

        return EvaluationResult(
            question=query,
            response=response,
            expected_answer=expected_answer,
            is_correct=is_correct,
            reasoning=reasoning,
        )
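A minimal sketch of calling this evaluator (it assumes an Ollama server is running locally with llama3.2:3b pulled; the question and answers are invented):

from src.impl.evaluator import Evaluator

evaluator = Evaluator()
result = evaluator.evaluate(
    query="What is the capital of France?",
    response="The capital of France is Paris.",
    expected_answer="Paris",
)
print(result.is_correct, result.reasoning)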
101
src/impl/indexer.py
Normal file
@@ -0,0 +1,101 @@
from typing import List
from src.interface.base_datastore import DataItem
from src.interface.base_indexer import BaseIndexer
from docling.document_converter import DocumentConverter
from docling_core.transforms.chunker.hybrid_chunker import HybridChunker
import os
from docling_core.transforms.chunker.tokenizer.huggingface import HuggingFaceTokenizer
from transformers import AutoTokenizer
from langchain_text_splitters import RecursiveCharacterTextSplitter
# pip install langchain langchain-text-splitters


EMBED_MODEL_ID = "sentence-transformers/all-MiniLM-L6-v2"
MAX_TOKENS = 256  # set to a small number for illustrative purposes
# previously 512


class Indexer(BaseIndexer):

    def __init__(self):
        self.converter = DocumentConverter()
        self.tokenizer = HuggingFaceTokenizer(
            tokenizer=AutoTokenizer.from_pretrained(EMBED_MODEL_ID),
            max_tokens=MAX_TOKENS,  # optional, by default derived from `tokenizer` for HF case
        )
        self.chunker = HybridChunker(
            tokenizer=self.tokenizer,
            max_tokens=MAX_TOKENS,
            # merge_peers = True,
            # handle_tables = "separate",
            # handle_pictures = "separate"
        )
        """self.text_splitter = RecursiveCharacterTextSplitter.from_huggingface_tokenizer(
            tokenizer = self.tokenizer,
            chunk_size = MAX_TOKENS,
            chunk_overlap = MAX_TOKENS // 10,
            separators=["\n\n", "\n", ". ", " ", ""],
        )"""

    def index(self, document_paths: List[str]) -> List[DataItem]:
        """Convert the documents to docling format and split them into chunks.
        The chunks are then turned into DataItems, with metadata added.

        Args:
            document_paths (List[str]): list of documents to index

        Returns:
            List[DataItem]: list of indexed DataItems
        """
        items = []
        for document_path in document_paths:
            try:
                document = self.converter.convert(document_path).document
                chunks = list(self.chunker.chunk(document))
                # chunks = self.text_splitter.split_text(document)

                item = self._convert_to_DataItem(chunks, document_path)
                items.extend(item)

            except Exception as e:
                print(f"Error while processing {document_path}: {e}")
                continue

        return items

    def _convert_to_DataItem(self, chunks, document_path: str) -> List[DataItem]:
        """Build a list of DataItems, prepending any headings to the content
        and deriving the source from the file name via the os module.

        Args:
            chunks: list of document chunks to process
            document_path: path to the document, used with os to get the file name directly

        Returns:
            List[DataItem]: list of DataItems holding the metadata, with content and source kept separate
        """
        items = []
        for i, chunk in enumerate(chunks):
            try:
                headings = ""
                if hasattr(chunk, 'meta') and hasattr(chunk.meta, 'headings') and chunk.meta.headings:
                    headings = "## " + ", ".join(chunk.meta.headings) + "\n"

                text = chunk.text if hasattr(chunk, 'text') else str(chunk)
                content = f"{headings}{text}"

                filename = os.path.basename(document_path)
                source = f"{filename}:chunk {i}"

                item = DataItem(content=content, source=source)
                items.append(item)

            except Exception as e:
                print(f"Error on chunk {i}: {e}")
                continue

        return items
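A quick sketch of the intended call pattern (the PDF path is hypothetical; docling performs the conversion and chunking):

from src.impl.indexer import Indexer

indexer = Indexer()
items = indexer.index(["data/sample_report.pdf"])  # hypothetical input file
for item in items[:3]:
    print(item.source)
    print(item.content[:120])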
83
src/impl/response_generator.py
Normal file
@@ -0,0 +1,83 @@
from typing import List
from ..interface.base_response_generator import BaseResponseGenerator
import requests
import json

SYSTEM_PROMPT = """You are an intelligent assistant that answers questions based on the provided context.

Important rules:
- Answer ONLY from the information in the context
- If the information is not in the context, clearly say "I cannot find this information in the provided documents"
- Cite the sources when relevant
- Answer in French, clearly and concisely
- Do not answer with "According to the document"; give the information directly"""


class ResponseGenerator(BaseResponseGenerator):

    def __init__(self, model_name: str = "llama3.2:3b", base_url: str = "http://localhost:11434"):
        self.model_name = model_name
        self.base_url = base_url

    def generate_response(self, query: str, context: List[str]) -> str:
        """Generate a response based on the query and the context."""

        # Format the context
        formatted_context = "\n\n".join([f"Document {i+1}:\n{doc}" for i, doc in enumerate(context)])

        # Build the prompt
        prompt = f"""Instructions: {SYSTEM_PROMPT}

Context: {formatted_context}

Question: {query}

Answer:"""

        # Call Ollama through its API
        try:
            response = requests.post(
                f"{self.base_url}/api/generate",
                json={
                    "model": self.model_name,
                    "prompt": prompt,
                    "stream": False,
                    "options": {
                        "temperature": 0.7,
                        "top_p": 0.9,
                    }
                }
            )

            # Check the response status
            response.raise_for_status()

            # Parse the JSON
            result = response.json()

            # DEBUG: show the structure of the response
            print(f"DEBUG - Response structure: {result.keys()}")

            # Check the different possible keys
            if "response" in result:
                return result["response"]
            elif "message" in result:
                return result["message"]
            elif "content" in result:
                return result["content"]
            else:
                # None of the expected keys was found
                print(f"DEBUG - Full response: {result}")
                return f"Error: unexpected response format. Available keys: {list(result.keys())}"

        except requests.exceptions.ConnectionError:
            return "❌ Unable to connect to the Ollama server. Check that Ollama is running with: ollama serve"

        except requests.exceptions.Timeout:
            return "⚠️ Generation took too long. Try a smaller model."

        except requests.exceptions.HTTPError as e:
            return f"❌ HTTP error {response.status_code}: {e}"

        except Exception as e:
            return f"❌ Error during generation: {str(e)}"
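A short usage sketch, assuming Ollama is reachable at the default URL; the context passages are placeholders:

from src.impl.response_generator import ResponseGenerator

generator = ResponseGenerator()
answer = generator.generate_response(
    query="Which embedding dimension is used?",
    context=[
        "all-MiniLM-L6-v2 produces 384-dimensional vectors.",
        "The LanceDB table stores vector, content and source.",
    ],
)
print(answer)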
43
src/impl/retriever.py
Normal file
@@ -0,0 +1,43 @@
from typing import List
from ..interface.base_retriever import BaseRetriever
from ..interface.base_datastore import BaseDatastore
from sentence_transformers import CrossEncoder
import numpy as np


class Retriever(BaseRetriever):

    def __init__(self, datastore: BaseDatastore):
        self.datastore = datastore
        self.model = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-6-v2')

    def search_retriever(self, query: str, top_k: int = 5) -> List[str]:
        """Search the datastore and rank the results by semantic relevance.

        Args:
            query (str): the query
            top_k (int, optional): number of results to return. Defaults to 5.

        Returns:
            List[str]: ranked list of document contents
        """
        search_results = self.datastore.search_datastore(query, top_k=top_k * 5)
        reranked_results = self._rerank(query, search_results, top_k)
        return reranked_results

    def _rerank(self, query: str, search_results: List[str], top_k: int = 10) -> List[str]:
        """Rerank the document contents by similarity to the query.

        Args:
            query (str): the query
            search_results (List[str]): list of retrieved documents
            top_k (int, optional): number of documents to return. Defaults to 10.

        Returns:
            List[str]: ranked list of documents
        """
        pairs = [[query, doc] for doc in search_results]
        scores = self.model.predict(pairs)
        ranked_indices = np.argsort(scores)[::-1]
        results = [search_results[idx] for idx in ranked_indices[:top_k]]
        return results
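Putting the pieces together, a sketch of how these classes appear to be meant to be wired end to end (the document path and question are placeholders):

from src.impl.datastore import Datastore
from src.impl.indexer import Indexer
from src.impl.retriever import Retriever
from src.impl.response_generator import ResponseGenerator

datastore = Datastore()
datastore.reset_table()

# Index documents and load them into the vector store.
items = Indexer().index(["data/sample_report.pdf"])  # hypothetical document
datastore.add_items(items)

# Retrieve, rerank with the cross-encoder, then generate a grounded answer.
retriever = Retriever(datastore)
context = retriever.search_retriever("What does the report conclude?", top_k=5)
print(ResponseGenerator().generate_response("What does the report conclude?", context))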
0
src/interface/__init__.py
Normal file
BIN
src/interface/__pycache__/__init__.cpython-312.pyc
Normal file
Binary file not shown.
BIN
src/interface/__pycache__/base_datastore.cpython-312.pyc
Normal file
Binary file not shown.
BIN
src/interface/__pycache__/base_indexer.cpython-312.pyc
Normal file
Binary file not shown.
Binary file not shown.
BIN
src/interface/__pycache__/base_retriever.cpython-312.pyc
Normal file
Binary file not shown.
22
src/interface/base_datastore.py
Normal file
@@ -0,0 +1,22 @@
from abc import ABC, abstractmethod
from typing import List
from pydantic import BaseModel


class DataItem(BaseModel):
    content: str = ""
    source: str = ""


class BaseDatastore(ABC):

    @abstractmethod
    def add_items(self, items: List[DataItem]) -> None:
        pass

    @abstractmethod
    def create_vector(self, content: str) -> List[float]:
        pass

    @abstractmethod
    def search_datastore(self, query: str, top_k: int = 5) -> List[str]:
        pass
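For tests, the interface can also be satisfied without LanceDB; a minimal in-memory sketch (toy vectors and naive substring search, purely illustrative and not part of this commit):

from typing import List
from src.interface.base_datastore import BaseDatastore, DataItem


class InMemoryDatastore(BaseDatastore):
    """Illustrative stand-in used only to exercise the interface in tests."""

    def __init__(self):
        self._items: List[DataItem] = []

    def add_items(self, items: List[DataItem]) -> None:
        self._items.extend(items)

    def create_vector(self, content: str) -> List[float]:
        # Not a real embedding: a fixed-length toy vector derived from the text.
        return [float(ord(c) % 7) for c in content[:8].ljust(8)]

    def search_datastore(self, query: str, top_k: int = 5) -> List[str]:
        # Naive keyword match instead of vector similarity.
        hits = [item.content for item in self._items if query.lower() in item.content.lower()]
        return hits[:top_k]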
17
src/interface/base_evaluator.py
Normal file
@@ -0,0 +1,17 @@
from abc import ABC, abstractmethod
from typing import Optional
from pydantic import BaseModel


class EvaluationResult(BaseModel):
    question: str
    response: str
    expected_answer: str
    is_correct: bool
    reasoning: Optional[str] = None


class BaseEvaluator(ABC):

    @abstractmethod
    def evaluate(self, query: str, response: str, expected_answer: str) -> EvaluationResult:
        pass
10
src/interface/base_indexer.py
Normal file
@@ -0,0 +1,10 @@
from abc import ABC, abstractmethod
from typing import List

from src.interface.base_datastore import DataItem


class BaseIndexer(ABC):

    @abstractmethod
    def index(self, document_paths: List[str]) -> List[DataItem]:
        pass
8
src/interface/base_response_generator.py
Normal file
@@ -0,0 +1,8 @@
from abc import ABC, abstractmethod
from typing import List


class BaseResponseGenerator(ABC):

    @abstractmethod
    def generate_response(self, query: str, context: List[str]) -> str:
        pass
8
src/interface/base_retriever.py
Normal file
@@ -0,0 +1,8 @@
from abc import ABC, abstractmethod
from typing import List


class BaseRetriever(ABC):

    @abstractmethod
    def search_retriever(self, query: str, top_k: int = 5) -> List[str]:
        pass