diff --git a/.gitignore b/.gitignore
index f64ffe5..391c593 100644
--- a/.gitignore
+++ b/.gitignore
@@ -20,3 +20,4 @@ venv/
# Logs
*.log
+.env
diff --git a/create_parser.py b/create_parser.py
new file mode 100644
index 0000000..0e2cd5c
--- /dev/null
+++ b/create_parser.py
@@ -0,0 +1,63 @@
+import argparse
+
+def create_parser():
+ parser = argparse.ArgumentParser(
+ description="RAG pipeline CLI"
+ )
+
+ path_arg_parent = argparse.ArgumentParser(add_help=False)
+ path_arg_parent.add_argument(
+ "-p",
+ "--path",
+ type=str,
+ required=False,
+ help="Path to a directory containing documents to index.",
+ )
+
+ eval_file_arg_parent = argparse.ArgumentParser(add_help=False)
+ eval_file_arg_parent.add_argument(
+ "-f",
+ "--eval_file",
+ type=str,
+ required=False,
+ help="Path to a .json file with question/expected_answer pairs.",
+ )
+
+
+ subparsers = parser.add_subparsers(dest="commands", help="Enter a command", required=True)
+
+ subparsers.add_parser(
+ "run",
+        help="Run the full pipeline, i.e. reset, add and evaluate",
+ parents=[path_arg_parent, eval_file_arg_parent]
+ )
+
+ subparsers.add_parser(
+ "reset",
+ help="Reset the pipeline"
+ )
+
+ subparsers.add_parser(
+ "add",
+        help="Add the documents to the database",
+ parents=[path_arg_parent]
+ )
+
+ subparsers.add_parser(
+ "evaluate",
+ help="Evaluate the model",
+ parents=[eval_file_arg_parent]
+ )
+
+ query_parser = subparsers.add_parser(
+ "query",
+ help="Query the documents"
+ )
+
+ query_parser.add_argument(
+ "prompt",
+ type=str,
+ help="Enter the query"
+ )
+
+ return parser
\ No newline at end of file
diff --git a/data/source/bilan_comptable_2024.pdf b/data/source/bilan_comptable_2024.pdf
deleted file mode 100644
index 9ae35ae..0000000
Binary files a/data/source/bilan_comptable_2024.pdf and /dev/null differ
diff --git a/data/source/database.pdf b/data/source/database.pdf
deleted file mode 100644
index 141948d..0000000
Binary files a/data/source/database.pdf and /dev/null differ
diff --git a/data/source/employes.pdf b/data/source/employes.pdf
deleted file mode 100644
index fa0f565..0000000
Binary files a/data/source/employes.pdf and /dev/null differ
diff --git a/data/source/fournisseurs.pdf b/data/source/fournisseurs.pdf
deleted file mode 100644
index 3ae5f1a..0000000
Binary files a/data/source/fournisseurs.pdf and /dev/null differ
diff --git a/data/source/historique_commandes.pdf b/data/source/historique_commandes.pdf
deleted file mode 100644
index 6544f7e..0000000
Binary files a/data/source/historique_commandes.pdf and /dev/null differ
diff --git a/data/source/planning_production_mars_2025.pdf b/data/source/planning_production_mars_2025.pdf
deleted file mode 100644
index 18ca74c..0000000
Binary files a/data/source/planning_production_mars_2025.pdf and /dev/null differ
diff --git a/main.py b/main.py
index 5e0e420..9a228f9 100644
--- a/main.py
+++ b/main.py
@@ -1,49 +1,61 @@
+import os
+import glob
+from typing import List
+
from src.impl.datastore import Datastore, DataItem
from src.impl.indexer import Indexer
from src.impl.retriever import Retriever
from src.impl.response_generator import ResponseGenerator
-TEST_PATH = "data/source"
+
+from src.RAG_pipeline import RAGpipeline
+from create_parser import create_parser
+
+DEFAULT_SOURCE_PATH = "data/source/"
+DEFAULT_EVAL_PATH =""
+
+def create_pipeline() -> RAGpipeline:
+ indexer = Indexer()
+ datastore = Datastore()
+ retriever = Retriever(datastore= datastore)
+ response_generator = ResponseGenerator()
+ return RAGpipeline(indexer = indexer, datastore = datastore, retriever= retriever, response_generator= response_generator)
+
def main():
- query_graphiste = "Quel est le salaire brut mensuel du graphiste ?"
- query_graphiste_en = "What is the monthly gross salary of the graphist designer ?"
- query_tshirt = "Quel est le prix du Tshirt rouge avec le motif1 ?"
- print("Testing indexer")
- indexer = Indexer()
- items_from_indexer = indexer.index(["data/source/informations_entreprise.pdf",
- "data/source/bilan_comptable_2024.csv",
- "data/source/employes.csv",
- "data/source/facture_14_03_2025.pdf",
- "data/source/fournisseurs.csv",
- "data/source/historique_commandes.csv",
- "data/source/planning_production_mars_2025.csv",
- "data/source/stock_tshirt.csv"
- ])
+
+ parser = create_parser()
+ args = parser.parse_args()
+ pipeline = create_pipeline()
+
+ source_path = getattr(args, "path", DEFAULT_SOURCE_PATH) or DEFAULT_SOURCE_PATH
+ documents_path = get_files_in_directory(source_path=source_path)
+
+ #eval_path = args.eval_file if args.eval_file else DEFAULT_EVAL_PATH
+ #sample_questions = json.load(open(eval_path, "r"))
+
+ commands = {
+ "run": lambda: pipeline.run(documents_path = documents_path),
+ "reset": lambda: pipeline.reset(),
+ "add": lambda: pipeline.add_documents(documents_path=documents_path),
+ "evaluate": lambda: pipeline.evaluate(),
+ "query": lambda: print(pipeline.process_query(args.prompt)),
+ }
+
+ try:
+ commands[args.commands]()
+ except Exception as e:
+        print(f"❌ ERREUR: {e}")
+ import traceback
+ traceback.print_exc()
+
+ commands[args.commands]
+
+ return
+
+def get_files_in_directory(source_path: str) -> List[str]:
+ if os.path.isfile(source_path):
+ return [source_path]
+ return glob.glob(os.path.join(source_path, "*"))
-
- print("Testing datastore")
- datastore = Datastore()
- print(f"Model's maximum sequence length:{datastore.model.max_seq_length}")
- test_vector = datastore.create_vector("test")
-
- data_item_to_test = DataItem(
- content = "Data item being tested",
- source = "from a test"
- )
-
- datastore.add_items([data_item_to_test])
-
- datastore.add_items(items_from_indexer)
- print(datastore.search_datastore(("Data item being tested")))
- print(datastore.search_datastore("Red t-shirt"))
-
- print("Testing retriever")
- retriever = Retriever(datastore= datastore)
- #print(retriever.search_retriever(query_graphiste))
-
- print("Testing Response generator")
- response_generator = ResponseGenerator()
- print(response_generator.generate_response(query_graphiste, retriever.search_retriever(query_graphiste)))
- print("fin")
- exit
-
-main()
\ No newline at end of file
+if __name__ == "__main__":
+ main()
+
\ No newline at end of file
diff --git a/src/RAG_pipeline.py b/src/RAG_pipeline.py
index 5bee608..50f6ed0 100644
--- a/src/RAG_pipeline.py
+++ b/src/RAG_pipeline.py
@@ -16,13 +16,31 @@ class RAGpipeline:
evaluator: Optional[BaseEvaluator] = None
def reset(self) -> None:
+        print("🗑️ Resetting the database...")
self.datastore.reset_table()
-
- def add_documents(self, documents: List[str]) -> None:
- items = self.indexer.index(documents)
- self.datastore.add_items(items= items)
return
+ def add_documents(self, documents_path : List[str]) -> None:
+ items = self.indexer.index(documents_path)
+        #print(f"📄 Adding documents: {', '.join(documents_path)}")
+ self.datastore.add_items(items= items)
+
+ return
+
+ def evaluate(self, arg1 = None) -> None:
+ """A compléter lors de l'ajout de l'évaluateur
+ """
+ print("Function not completed.")
+ return
+
+ def run(self, documents_path: List[str], arg2 = None) -> None:
+ self.reset()
+ self.add_documents(documents_path =documents_path)
+ if arg2 :
+ self.evaluate(arg2)
+ return
+
+
def process_query(self, query: str, top_k: int = 5, source: bool = False) -> str:
"""GĂ©nĂšre la rĂ©ponse Ă la requĂȘte Ă partir du contexte rĂ©cupĂ©rĂ©.
Affiche éventuellement les sources.
@@ -33,6 +51,7 @@ class RAGpipeline:
Returns:
str: Réponse générée
"""
+        print(f"[DEBUG] Traitement de la requête: {query}")
context = self.retriever.search_retriever(query, top_k= top_k)
if( source):
@@ -40,10 +59,6 @@ class RAGpipeline:
print(f"Based on the document {i+1}: {doc} \n")
response = self.response_generator.generate_response(query= query, context= context)
+ #print(f"Reponse: {response}")
return response
- def evaluate(self) -> None:
- """A compléter lors de l'ajout de l'évaluateur
- """
- print("Function not completed.")
- pass
\ No newline at end of file
diff --git a/src/impl/evaluator.py b/src/impl/evaluator.py
index f17bea4..a5271ee 100644
--- a/src/impl/evaluator.py
+++ b/src/impl/evaluator.py
@@ -3,7 +3,7 @@ import requests
class Evaluator(BaseEvaluator):
- def __init__(self, model_name: str = "llama3.2:3b", base_url: str = "http://localhost:11434"):
+ def __init__(self, model_name: str = "llama3.2:8b", base_url: str = "http://localhost:11434"):
self.base_url = base_url
self.model_name = model_name
@@ -23,7 +23,7 @@ Then return the result in ... tags â either as 'true' or 'fal
def evaluate(self, query: str, response: str, expected_answer: str) -> EvaluationResult:
user_prompt = f"""
- \n{query}
+ \n{query}
\n{response}
\n{expected_answer}
"""
diff --git a/src/impl/response_generator.py b/src/impl/response_generator.py
index 7b6ce97..65cc7cc 100644
--- a/src/impl/response_generator.py
+++ b/src/impl/response_generator.py
@@ -2,7 +2,7 @@ from typing import List, Optional
from ..interface.base_response_generator import BaseResponseGenerator
from groq import Groq
import os
-
+from dotenv import load_dotenv
SYSTEM_PROMPT = """Tu es un assistant intelligent qui répond aux questions en te basant sur le contexte fourni.
@@ -17,8 +17,10 @@ RĂšgles importantes:
class ResponseGenerator(BaseResponseGenerator):
def __init__(self, api_key: Optional[str] = None):
- try :
+ try :
+ load_dotenv()
self.api_key = api_key or os.getenv("GROQ_API_KEY")
+            print("Clé API récupérée avec succès \n")
except Exception as e:
raise ValueError(f"erreur avec la clé API: {e}")
diff --git a/src/interface/base_datastore.py b/src/interface/base_datastore.py
index be6bd86..32b808b 100644
--- a/src/interface/base_datastore.py
+++ b/src/interface/base_datastore.py
@@ -1,7 +1,7 @@
from abc import ABC,abstractmethod
from typing import List
from pydantic import BaseModel
-
+from lancedb.table import Table
class DataItem(BaseModel):
content: str = ""
@@ -9,6 +9,10 @@ class DataItem(BaseModel):
class BaseDatastore(ABC):
+ @abstractmethod
+ def reset_table(self) -> Table:
+ pass
+
@abstractmethod
def add_items(self, items: List[DataItem]) -> None:
pass