Initial commit
This commit is contained in:
BIN
__pycache__/main.cpython-312.pyc
Normal file
BIN
__pycache__/main.cpython-312.pyc
Normal file
Binary file not shown.
BIN
data/sample-lancedb/rag-table.lance/_latest.manifest
Normal file
BIN
data/sample-lancedb/rag-table.lance/_latest.manifest
Normal file
Binary file not shown.
@ -0,0 +1 @@
|
|||||||
|
$8296f8ca-f94c-4570-a66a-d3dfbdac8cba<62>{2vector <20><><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>*fixed_size_list:float:38408"content <20><><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>*string08!source <20><><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>*string08
|
||||||
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
BIN
data/sample-lancedb/rag-table.lance/_versions/1.manifest
Normal file
BIN
data/sample-lancedb/rag-table.lance/_versions/1.manifest
Normal file
Binary file not shown.
BIN
data/sample-lancedb/rag-table.lance/_versions/10.manifest
Normal file
BIN
data/sample-lancedb/rag-table.lance/_versions/10.manifest
Normal file
Binary file not shown.
BIN
data/sample-lancedb/rag-table.lance/_versions/11.manifest
Normal file
BIN
data/sample-lancedb/rag-table.lance/_versions/11.manifest
Normal file
Binary file not shown.
BIN
data/sample-lancedb/rag-table.lance/_versions/12.manifest
Normal file
BIN
data/sample-lancedb/rag-table.lance/_versions/12.manifest
Normal file
Binary file not shown.
BIN
data/sample-lancedb/rag-table.lance/_versions/13.manifest
Normal file
BIN
data/sample-lancedb/rag-table.lance/_versions/13.manifest
Normal file
Binary file not shown.
BIN
data/sample-lancedb/rag-table.lance/_versions/14.manifest
Normal file
BIN
data/sample-lancedb/rag-table.lance/_versions/14.manifest
Normal file
Binary file not shown.
BIN
data/sample-lancedb/rag-table.lance/_versions/15.manifest
Normal file
BIN
data/sample-lancedb/rag-table.lance/_versions/15.manifest
Normal file
Binary file not shown.
BIN
data/sample-lancedb/rag-table.lance/_versions/16.manifest
Normal file
BIN
data/sample-lancedb/rag-table.lance/_versions/16.manifest
Normal file
Binary file not shown.
BIN
data/sample-lancedb/rag-table.lance/_versions/17.manifest
Normal file
BIN
data/sample-lancedb/rag-table.lance/_versions/17.manifest
Normal file
Binary file not shown.
BIN
data/sample-lancedb/rag-table.lance/_versions/18.manifest
Normal file
BIN
data/sample-lancedb/rag-table.lance/_versions/18.manifest
Normal file
Binary file not shown.
BIN
data/sample-lancedb/rag-table.lance/_versions/19.manifest
Normal file
BIN
data/sample-lancedb/rag-table.lance/_versions/19.manifest
Normal file
Binary file not shown.
BIN
data/sample-lancedb/rag-table.lance/_versions/2.manifest
Normal file
BIN
data/sample-lancedb/rag-table.lance/_versions/2.manifest
Normal file
Binary file not shown.
BIN
data/sample-lancedb/rag-table.lance/_versions/20.manifest
Normal file
BIN
data/sample-lancedb/rag-table.lance/_versions/20.manifest
Normal file
Binary file not shown.
BIN
data/sample-lancedb/rag-table.lance/_versions/21.manifest
Normal file
BIN
data/sample-lancedb/rag-table.lance/_versions/21.manifest
Normal file
Binary file not shown.
BIN
data/sample-lancedb/rag-table.lance/_versions/22.manifest
Normal file
BIN
data/sample-lancedb/rag-table.lance/_versions/22.manifest
Normal file
Binary file not shown.
BIN
data/sample-lancedb/rag-table.lance/_versions/23.manifest
Normal file
BIN
data/sample-lancedb/rag-table.lance/_versions/23.manifest
Normal file
Binary file not shown.
BIN
data/sample-lancedb/rag-table.lance/_versions/3.manifest
Normal file
BIN
data/sample-lancedb/rag-table.lance/_versions/3.manifest
Normal file
Binary file not shown.
BIN
data/sample-lancedb/rag-table.lance/_versions/4.manifest
Normal file
BIN
data/sample-lancedb/rag-table.lance/_versions/4.manifest
Normal file
Binary file not shown.
BIN
data/sample-lancedb/rag-table.lance/_versions/5.manifest
Normal file
BIN
data/sample-lancedb/rag-table.lance/_versions/5.manifest
Normal file
Binary file not shown.
BIN
data/sample-lancedb/rag-table.lance/_versions/6.manifest
Normal file
BIN
data/sample-lancedb/rag-table.lance/_versions/6.manifest
Normal file
Binary file not shown.
BIN
data/sample-lancedb/rag-table.lance/_versions/7.manifest
Normal file
BIN
data/sample-lancedb/rag-table.lance/_versions/7.manifest
Normal file
Binary file not shown.
BIN
data/sample-lancedb/rag-table.lance/_versions/8.manifest
Normal file
BIN
data/sample-lancedb/rag-table.lance/_versions/8.manifest
Normal file
Binary file not shown.
BIN
data/sample-lancedb/rag-table.lance/_versions/9.manifest
Normal file
BIN
data/sample-lancedb/rag-table.lance/_versions/9.manifest
Normal file
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
BIN
data/source/bilan_comptable_2024.pdf
Normal file
BIN
data/source/bilan_comptable_2024.pdf
Normal file
Binary file not shown.
BIN
data/source/database.pdf
Normal file
BIN
data/source/database.pdf
Normal file
Binary file not shown.
BIN
data/source/employes.pdf
Normal file
BIN
data/source/employes.pdf
Normal file
Binary file not shown.
BIN
data/source/facture_14_03_2025.pdf
Normal file
BIN
data/source/facture_14_03_2025.pdf
Normal file
Binary file not shown.
BIN
data/source/fournisseurs.pdf
Normal file
BIN
data/source/fournisseurs.pdf
Normal file
Binary file not shown.
BIN
data/source/historique_commandes.pdf
Normal file
BIN
data/source/historique_commandes.pdf
Normal file
Binary file not shown.
BIN
data/source/planning_production_mars_2025.pdf
Normal file
BIN
data/source/planning_production_mars_2025.pdf
Normal file
Binary file not shown.
16
requirements.txt
Normal file
16
requirements.txt
Normal file
@ -0,0 +1,16 @@
|
|||||||
|
# To install: pip install -r requirements.txt
|
||||||
|
numpy<2
|
||||||
|
pandas>=2.1.4,<3.0
|
||||||
|
scikit-learn
|
||||||
|
torch --index-url https://download.pytorch.org/whl/cpu
|
||||||
|
transformers
|
||||||
|
accelerate>=1.2.1,<2.0.0
|
||||||
|
sentence-transformers
|
||||||
|
pyarrow==14.0.1
|
||||||
|
|
||||||
|
# Default Dependencies
|
||||||
|
pydantic>=2.0.0 # For data validation
|
||||||
|
lancedb==0.6.13
|
||||||
|
docling==2.31.0
|
||||||
|
cohere==5.15.0
|
||||||
|
|
||||||
0
src/__init__.py
Normal file
0
src/__init__.py
Normal file
BIN
src/__pycache__/__init__.cpython-312.pyc
Normal file
BIN
src/__pycache__/__init__.cpython-312.pyc
Normal file
Binary file not shown.
0
src/impl/__init__.py
Normal file
0
src/impl/__init__.py
Normal file
BIN
src/impl/__pycache__/__init__.cpython-312.pyc
Normal file
BIN
src/impl/__pycache__/__init__.cpython-312.pyc
Normal file
Binary file not shown.
BIN
src/impl/__pycache__/datastore.cpython-312.pyc
Normal file
BIN
src/impl/__pycache__/datastore.cpython-312.pyc
Normal file
Binary file not shown.
BIN
src/impl/__pycache__/indexer.cpython-312.pyc
Normal file
BIN
src/impl/__pycache__/indexer.cpython-312.pyc
Normal file
Binary file not shown.
BIN
src/impl/__pycache__/response_generator.cpython-312.pyc
Normal file
BIN
src/impl/__pycache__/response_generator.cpython-312.pyc
Normal file
Binary file not shown.
BIN
src/impl/__pycache__/retriever.cpython-312.pyc
Normal file
BIN
src/impl/__pycache__/retriever.cpython-312.pyc
Normal file
Binary file not shown.
163
src/impl/datastore.py
Normal file
163
src/impl/datastore.py
Normal file
@ -0,0 +1,163 @@
|
|||||||
|
from typing import List
|
||||||
|
from ..interface.base_datastore import BaseDatastore, DataItem
|
||||||
|
import lancedb
|
||||||
|
from lancedb.table import Table
|
||||||
|
from typing import List
|
||||||
|
import pyarrow as pa
|
||||||
|
from sentence_transformers import SentenceTransformer
|
||||||
|
from sklearn.metrics.pairwise import cosine_similarity
|
||||||
|
import numpy as np
|
||||||
|
# from concurrent.futures import ThreadPoolExecutor
|
||||||
|
|
||||||
|
|
||||||
|
class Datastore(BaseDatastore):
    """LanceDB-backed vector datastore using SentenceTransformer embeddings.

    Rows follow the schema (vector, content, source); search is a
    nearest-neighbour lookup on the ``vector`` column.
    """

    DB_PATH = "data/sample-lancedb"
    DB_TABLE_NAME = "rag-table"

    def __init__(self):
        """Load the embedding model, connect the database and open the table.

        all-MiniLM-L6-v2 produces fixed 384-dimensional vectors; its
        maximum sequence length is 256 tokens.
        """
        self.vector_dimensions = 384  # all-MiniLM-L6-v2 has a fixed dimension of 384
        self.model = SentenceTransformer("all-MiniLM-L6-v2")
        self.vector_db = lancedb.connect(self.DB_PATH)
        self.table: Table = self._get_table()

    def reset_table(self) -> Table:
        """Drop the table if it exists, then recreate and open it.

        Schema: vector (fixed-size list of float32 of length
        ``self.vector_dimensions``), content (utf8) and source (utf8).

        Returns:
            Table: the freshly created table.
        """
        try:
            self.vector_db.drop_table(self.DB_TABLE_NAME)
        except Exception:
            # drop_table raises when the table is absent; treat that as "nothing to drop".
            print("Unable to drop the table, assuming it does not exist.")

        schema = pa.schema(
            [
                pa.field("vector", pa.list_(pa.float32(), self.vector_dimensions)),
                pa.field("content", pa.utf8()),
                pa.field("source", pa.utf8()),
            ]
        )

        self.table = self.vector_db.create_table(self.DB_TABLE_NAME, schema=schema)
        print(f"Table was reset/created: {self.DB_TABLE_NAME} in {self.DB_PATH}")
        return self.table

    def _get_table(self) -> Table:
        """Open the table, resetting it when opening fails.

        Returns:
            Table: the opened (or recreated) table.
        """
        try:
            return self.vector_db.open_table(self.DB_TABLE_NAME)
        except Exception as e:
            print(f"Error opening the table {e}. Trying to reset it.")
            return self.reset_table()

    def add_items(self, items: List[DataItem]) -> None:
        """Embed the given items and insert them into the table.

        Embedding is the expensive step; encoding is batched (batch_size=32).
        The call is a no-op for an empty list.

        Args:
            items (List[DataItem]): items to add.
        """
        if not items:
            return

        contents = [item.content for item in items]
        sources = [item.source for item in items]

        # Embed every content string, in batches of 32.
        print(f"Génération des embeddings pour {len(items)} items...")
        vectors = self.model.encode(
            contents,
            batch_size=32,
        )

        # Convert to row dictionaries matching the table schema.
        entries = [
            {
                "vector": vector,
                "content": content,
                "source": source,
            }
            for vector, content, source in zip(vectors, contents, sources)
        ]

        self.table.add(entries)
        print(f"{len(entries)} items ajoutés")

    # Deprecated: add_items() now builds the row dictionaries in bulk.
    def _convert_items_to_entry(self, item: DataItem) -> dict:
        """Convert a single DataItem into a row matching the table schema.

        Args:
            item (DataItem): item to convert.

        Returns:
            dict: row carrying the vector, the content and the source.
        """
        vector = self.create_vector(item.content)

        return {
            "vector": vector,
            "content": item.content,
            "source": item.source,
        }

    def create_vector(self, content: str) -> List[float]:
        """Embed a string with the sentence-transformer model.

        Args:
            content (str): text to vectorize.

        Returns:
            List[float]: the embedding vector.
        """
        response = self.model.encode(content)
        return response.tolist()

    def search_datastore(self, query: str, top_k: int = 5) -> List[str]:
        """Embed the query and run a vector search over the table.

        Args:
            query (str): search query.
            top_k (int, optional): maximum number of documents to return. Defaults to 5.

        Returns:
            List[str]: contents of the retrieved documents.
        """
        vector = self.model.encode(query)
        results = (
            self.table
            .search(vector, vector_column_name="vector")
            .select(["content", "source"])
            .limit(top_k)
            .to_list()
        )

        # Keep only rows that actually carry non-null content.
        result_content = [
            result["content"]
            for result in results
            if "content" in result and result["content"] is not None
        ]
        return result_content
|
||||||
|
|
||||||
|
|
||||||
45
src/impl/evaluator.py
Normal file
45
src/impl/evaluator.py
Normal file
@ -0,0 +1,45 @@
|
|||||||
|
from ..interface.base_evaluator import EvaluationResult, BaseEvaluator
|
||||||
|
import requests
|
||||||
|
|
||||||
|
class Evaluator(BaseEvaluator):
    """LLM-as-judge evaluator backed by a local Ollama server."""

    SYSTEM_PROMPT = """
You are a system that evaluates the correctness of a response to a question.
The question will be provided in <question>...</question> tags.
The response will be provided in <response>...</response> tags.
The expected answer will be provided in <expected_answer>...</expected_answer> tags.

The response doesn't have to exactly match all the words/context the expected answer. It just needs to be right about
the answer to the actual question itself.

Evaluate whether the response is correct or not, and return your reasoning in <reasoning>...</reasoning> tags.
Then return the result in <result>...</result> tags — either as 'true' or 'false'.
"""

    def __init__(self, model_name: str = "llama3.2:3b", base_url: str = "http://localhost:11434"):
        """Remember which model to query and where the Ollama server lives."""
        self.base_url = base_url
        self.model_name = model_name

    def evaluate(self, query: str, response: str, expected_answer: str) -> EvaluationResult:
        """Ask the judge model whether *response* answers *query* correctly.

        Args:
            query (str): the question that was asked.
            response (str): the answer produced by the pipeline.
            expected_answer (str): the reference answer.

        Returns:
            EvaluationResult: the evaluation outcome.
        """
        # Bug fix: the opening tag was "<questions>"; it must match the
        # "</question>" closing tag announced in SYSTEM_PROMPT.
        user_prompt = f"""
<question> \n{query} </question>
<response> \n{response} </response>
<expected_answer> \n{expected_answer} </expected_answer>
"""

        # Bug fixes: this call was missing its closing parenthesis (a syntax
        # error), SYSTEM_PROMPT was defined but never sent to the model, and
        # no timeout was set.
        response_content = requests.post(
            f"{self.base_url}/api/generate",
            json={
                "model": self.model_name,
                "prompt": f"{self.SYSTEM_PROMPT}\n{user_prompt}",
                "stream": False,
                "options": {
                    "temperature": 0.7,
                    "top_p": 0.9,
                }
            },
            timeout=120,
        )

        # TODO(review): response_content is still unused — the <result> tag of
        # the model's reply should be parsed into an EvaluationResult here.
        # The delegation below preserves the original control flow.
        return super().evaluate(query, response, expected_answer)
|
||||||
|
|
||||||
|
|
||||||
101
src/impl/indexer.py
Normal file
101
src/impl/indexer.py
Normal file
@ -0,0 +1,101 @@
|
|||||||
|
from typing import List
|
||||||
|
from src.interface.base_datastore import DataItem
|
||||||
|
from src.interface.base_indexer import BaseIndexer
|
||||||
|
from docling.document_converter import DocumentConverter
|
||||||
|
from docling_core.transforms.chunker.hybrid_chunker import HybridChunker
|
||||||
|
import os
|
||||||
|
from docling_core.transforms.chunker.tokenizer.huggingface import HuggingFaceTokenizer
|
||||||
|
from transformers import AutoTokenizer
|
||||||
|
from langchain_text_splitters import RecursiveCharacterTextSplitter
|
||||||
|
# pip install langchain langchain-text-splitters
|
||||||
|
|
||||||
|
|
||||||
|
EMBED_MODEL_ID = "sentence-transformers/all-MiniLM-L6-v2"
|
||||||
|
MAX_TOKENS = 256 # set to a small number for illustrative purposes
|
||||||
|
#précedemment 512
|
||||||
|
|
||||||
|
class Indexer(BaseIndexer):
    """Converts documents with docling and splits them into token-bounded chunks."""

    def __init__(self):
        """Build the converter, the HF tokenizer wrapper and the hybrid chunker."""
        self.converter = DocumentConverter()
        self.tokenizer = HuggingFaceTokenizer(
            tokenizer=AutoTokenizer.from_pretrained(EMBED_MODEL_ID),
            max_tokens=MAX_TOKENS,  # optional, by default derived from `tokenizer` for HF case
        )
        self.chunker = HybridChunker(
            tokenizer=self.tokenizer,
            max_tokens=MAX_TOKENS,
        )

    def index(self, document_paths: List[str]) -> List[DataItem]:
        """Convert each document to the docling format, then chunk it.

        Chunks are turned into DataItems, enriching the content with any
        section headings found in the chunk metadata. A document that fails
        to convert is reported and skipped.

        Args:
            document_paths (List[str]): list of documents to index.

        Returns:
            List[DataItem]: the indexed items.
        """
        items = []
        for document_path in document_paths:
            try:
                document = self.converter.convert(document_path).document
                chunks = list(self.chunker.chunk(document))

                item = self._convert_to_DataItem(chunks, document_path)

                items.extend(item)

            except Exception as e:
                print(f"Erreur lors du traitement de {document_path}: {e}")
                continue

        return items

    def _convert_to_DataItem(self, chunks, document_path: str) -> List[DataItem]:
        """Build DataItems, prefixing the content with headings when present.

        The source is "<filename>:chunk <i>", with the filename taken from the
        document path via os.path.basename. A chunk that fails is reported
        and skipped.

        Args:
            chunks: list of document chunks to process.
            document_path: path to the document, used to derive the filename.

        Returns:
            List[DataItem]: items carrying the content and the source.
        """
        items = []
        for i, chunk in enumerate(chunks):
            try:
                headings = ""

                if hasattr(chunk, 'meta') and hasattr(chunk.meta, 'headings') and chunk.meta.headings:
                    headings = "## " + ", ".join(chunk.meta.headings) + "\n"

                text = chunk.text if hasattr(chunk, 'text') else str(chunk)

                content = f"{headings}{text}"

                filename = os.path.basename(document_path)
                # Bug fix: filename was computed but never used — the source
                # previously hard-coded a placeholder string.
                source = f"{filename}:chunk {i}"

                item = DataItem(content=content, source=source)
                items.append(item)

            except Exception as e:
                print(f" Erreur sur le chunk {i}: {e}")
                continue

        return items
|
||||||
83
src/impl/response_generator.py
Normal file
83
src/impl/response_generator.py
Normal file
@ -0,0 +1,83 @@
|
|||||||
|
from typing import List
|
||||||
|
from ..interface.base_response_generator import BaseResponseGenerator
|
||||||
|
import requests
|
||||||
|
import json
|
||||||
|
|
||||||
|
SYSTEM_PROMPT = """Tu es un assistant intelligent qui répond aux questions en te basant sur le contexte fourni.

Règles importantes:
- Réponds UNIQUEMENT en te basant sur les informations du contexte
- Si l'information n'est pas dans le contexte, dis clairement "Je ne trouve pas cette information dans les documents fournis"
- Cite les sources quand c'est pertinent
- Réponds en français de manière claire et concise
- Ne réponds pas avec "Selon le document" mais donne directement l'information"""


class ResponseGenerator(BaseResponseGenerator):
    """Generates grounded answers with a local Ollama model over HTTP."""

    def __init__(self, model_name: str = "llama3.2:3b", base_url: str = "http://localhost:11434"):
        """Remember which model to query and where the Ollama server lives."""
        self.model_name = model_name
        self.base_url = base_url

    def generate_response(self, query: str, context: List[str]) -> str:
        """Generate an answer to *query* grounded in the retrieved *context*.

        Args:
            query (str): the user's question.
            context (List[str]): retrieved document contents.

        Returns:
            str: the model's answer, or a human-readable error message.
        """
        # Format the retrieved documents into a single context string.
        formatted_context = "\n\n".join([f"Document {i+1}:\n{doc}" for i, doc in enumerate(context)])

        # Build the prompt.
        prompt = f"""Instructions: {SYSTEM_PROMPT}

Contexte: {formatted_context}

Question: {query}

Réponse:"""

        # Call Ollama through its HTTP API.
        try:
            response = requests.post(
                f"{self.base_url}/api/generate",
                json={
                    "model": self.model_name,
                    "prompt": prompt,
                    "stream": False,
                    "options": {
                        "temperature": 0.7,
                        "top_p": 0.9,
                    }
                },
                # Bug fix: requests.exceptions.Timeout is caught below but no
                # timeout was ever set, so the call could hang forever and
                # that handler was unreachable.
                timeout=120,
            )

            # Raise for 4xx/5xx statuses.
            response.raise_for_status()

            # Parse the JSON body.
            result = response.json()

            # DEBUG: show the structure of the response.
            print(f"DEBUG - Structure de la réponse: {result.keys()}")

            # /api/generate normally answers under "response"; fall back to
            # other plausible keys.
            if "response" in result:
                return result["response"]
            elif "message" in result:
                return result["message"]
            elif "content" in result:
                return result["content"]
            else:
                # No expected key found — surface the payload for debugging.
                print(f"DEBUG - Réponse complète: {result}")
                return f"Erreur: Format de réponse inattendu. Clés disponibles: {list(result.keys())}"

        except requests.exceptions.ConnectionError:
            return "❌ Impossible de se connecter au serveur Ollama. Vérifiez qu'Ollama est en cours d'exécution avec: ollama serve"

        except requests.exceptions.Timeout:
            return "⚠️ La génération a pris trop de temps. Essayez avec un modèle plus petit."

        except requests.exceptions.HTTPError as e:
            return f"❌ Erreur HTTP {response.status_code}: {e}"

        except Exception as e:
            return f"❌ Erreur lors de la génération: {str(e)}"
|
||||||
43
src/impl/retriever.py
Normal file
43
src/impl/retriever.py
Normal file
@ -0,0 +1,43 @@
|
|||||||
|
from typing import List
|
||||||
|
from ..interface.base_retriever import BaseRetriever
|
||||||
|
from ..interface.base_datastore import BaseDatastore
|
||||||
|
from sentence_transformers import CrossEncoder
|
||||||
|
import numpy as np
|
||||||
|
|
||||||
|
class Retriever(BaseRetriever):
    """Two-stage retriever: broad vector search, then cross-encoder reranking."""

    def __init__(self, datastore: BaseDatastore):
        """Keep a handle on the datastore and load the reranking cross-encoder."""
        self.datastore = datastore
        self.model = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-6-v2')

    def search_retriever(self, query: str, top_k: int = 5) -> List[str]:
        """Search the datastore, then rerank the candidates semantically.

        Over-fetches 5x the requested amount so the reranker has a wider
        candidate pool to choose from.

        Args:
            query (str): the search query.
            top_k (int, optional): number of results to return. Defaults to 5.

        Returns:
            List[str]: ranked document contents.
        """
        candidates = self.datastore.search_datastore(query, top_k=top_k * 5)
        return self._rerank(query, candidates, top_k)

    def _rerank(self, query: str, search_results: List[str], top_k: int = 10) -> List[str]:
        """Order document contents by cross-encoder relevance to the query.

        Args:
            query (str): the search query.
            search_results (List[str]): retrieved document contents.
            top_k (int, optional): number of documents to keep. Defaults to 10.

        Returns:
            List[str]: the top_k contents, most relevant first.
        """
        query_doc_pairs = [[query, doc] for doc in search_results]
        relevance_scores = self.model.predict(query_doc_pairs)
        # Highest score first; keep only the best top_k indices.
        best_indices = np.argsort(relevance_scores)[::-1][:top_k]
        return [search_results[idx] for idx in best_indices]
|
||||||
0
src/interface/__init__.py
Normal file
0
src/interface/__init__.py
Normal file
BIN
src/interface/__pycache__/__init__.cpython-312.pyc
Normal file
BIN
src/interface/__pycache__/__init__.cpython-312.pyc
Normal file
Binary file not shown.
BIN
src/interface/__pycache__/base_datastore.cpython-312.pyc
Normal file
BIN
src/interface/__pycache__/base_datastore.cpython-312.pyc
Normal file
Binary file not shown.
BIN
src/interface/__pycache__/base_indexer.cpython-312.pyc
Normal file
BIN
src/interface/__pycache__/base_indexer.cpython-312.pyc
Normal file
Binary file not shown.
Binary file not shown.
BIN
src/interface/__pycache__/base_retriever.cpython-312.pyc
Normal file
BIN
src/interface/__pycache__/base_retriever.cpython-312.pyc
Normal file
Binary file not shown.
22
src/interface/base_datastore.py
Normal file
22
src/interface/base_datastore.py
Normal file
@ -0,0 +1,22 @@
|
|||||||
|
from abc import ABC,abstractmethod
|
||||||
|
from typing import List
|
||||||
|
from pydantic import BaseModel
|
||||||
|
|
||||||
|
|
||||||
|
class DataItem(BaseModel):
    """One indexable unit of data: a chunk of text plus its origin."""

    # Raw text content of the chunk (may include a "## heading" prefix).
    content: str = ""
    # Origin label — presumably "<filename>:chunk <i>"; confirm against Indexer.
    source: str = ""
|
||||||
|
|
||||||
|
class BaseDatastore(ABC):
    """Abstract contract for a vector datastore."""

    @abstractmethod
    def add_items(self, items: List[DataItem]) -> None:
        """Embed and persist the given items into the store."""
        ...

    @abstractmethod
    def create_vector(self, content: str) -> List[float]:
        """Return the embedding vector for a piece of text."""
        ...

    @abstractmethod
    def search_datastore(self, query: str, top_k: int = 5) -> List[str]:
        """Return the contents of the top_k entries closest to the query."""
        ...
|
||||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user