diff --git a/.gitignore b/.gitignore
index f64ffe5..391c593 100644
--- a/.gitignore
+++ b/.gitignore
@@ -20,3 +20,4 @@ venv/
 
 # Logs
 *.log
+.env
diff --git a/create_parser.py b/create_parser.py
new file mode 100644
index 0000000..0e2cd5c
--- /dev/null
+++ b/create_parser.py
@@ -0,0 +1,63 @@
+import argparse
+
+def create_parser():
+    parser = argparse.ArgumentParser(
+        description="RAG pipeline CLI"
+    )
+
+    path_arg_parent = argparse.ArgumentParser(add_help=False)
+    path_arg_parent.add_argument(
+        "-p",
+        "--path",
+        type=str,
+        required=False,
+        help="Path to a directory containing documents to index.",
+    )
+
+    eval_file_arg_parent = argparse.ArgumentParser(add_help=False)
+    eval_file_arg_parent.add_argument(
+        "-f",
+        "--eval_file",
+        type=str,
+        required=False,
+        help="Path to a .json file with question/expected_answer pairs.",
+    )
+
+
+    subparsers = parser.add_subparsers(dest="commands", help="Enter a command", required=True)
+
+    subparsers.add_parser(
+        "run",
+        help="Run the full pipeline, i.e. reset, add and evaluate",
+        parents=[path_arg_parent, eval_file_arg_parent]
+    )
+
+    subparsers.add_parser(
+        "reset",
+        help="Reset the pipeline"
+    )
+
+    subparsers.add_parser(
+        "add",
+        help="Add the documents to the database",
+        parents=[path_arg_parent]
+    )
+
+    subparsers.add_parser(
+        "evaluate",
+        help="Evaluate the model",
+        parents=[eval_file_arg_parent]
+    )
+
+    query_parser = subparsers.add_parser(
+        "query",
+        help="Query the documents"
+    )
+
+    query_parser.add_argument(
+        "prompt",
+        type=str,
+        help="Enter the query"
+    )
+
+    return parser
\ No newline at end of file
diff --git a/data/source/bilan_comptable_2024.pdf b/data/source/bilan_comptable_2024.pdf
deleted file mode 100644
index 9ae35ae..0000000
Binary files a/data/source/bilan_comptable_2024.pdf and /dev/null differ
diff --git a/data/source/database.pdf b/data/source/database.pdf
deleted file mode 100644
index 141948d..0000000
Binary files a/data/source/database.pdf and /dev/null differ
diff --git a/data/source/employes.pdf b/data/source/employes.pdf
deleted file mode 100644
index fa0f565..0000000
Binary files a/data/source/employes.pdf and /dev/null differ
diff --git a/data/source/fournisseurs.pdf b/data/source/fournisseurs.pdf
deleted file mode 100644
index 3ae5f1a..0000000
Binary files a/data/source/fournisseurs.pdf and /dev/null differ
diff --git a/data/source/historique_commandes.pdf b/data/source/historique_commandes.pdf
deleted file mode 100644
index 6544f7e..0000000
Binary files a/data/source/historique_commandes.pdf and /dev/null differ
diff --git a/data/source/planning_production_mars_2025.pdf b/data/source/planning_production_mars_2025.pdf
deleted file mode 100644
index 18ca74c..0000000
Binary files a/data/source/planning_production_mars_2025.pdf and /dev/null differ
diff --git a/main.py b/main.py
index 5e0e420..9a228f9 100644
--- a/main.py
+++ b/main.py
@@ -1,49 +1,61 @@
+import os
+import glob
+from typing import List
+
 from src.impl.datastore import Datastore, DataItem
 from src.impl.indexer import Indexer
 from src.impl.retriever import Retriever
 from src.impl.response_generator import ResponseGenerator
-TEST_PATH = "data/source"
+
+from src.RAG_pipeline import RAGpipeline
+from create_parser import create_parser
+
+DEFAULT_SOURCE_PATH = "data/source/"
+DEFAULT_EVAL_PATH = ""
+
+def create_pipeline() -> RAGpipeline:
+    indexer = Indexer()
+    datastore = Datastore()
+    retriever = Retriever(datastore=datastore)
+    response_generator = ResponseGenerator()
+    return RAGpipeline(indexer=indexer, datastore=datastore,
+                       retriever=retriever, response_generator=response_generator)
+
 def main():
-    query_graphiste = "Quel est le salaire brut mensuel du graphiste ?"
-    query_graphiste_en = "What is the monthly gross salary of the graphist designer ?"
-    query_tshirt = "Quel est le prix du Tshirt rouge avec le motif1 ?"
-    print("Testing indexer")
-    indexer = Indexer()
-    items_from_indexer = indexer.index(["data/source/informations_entreprise.pdf",
-                                        "data/source/bilan_comptable_2024.csv",
-                                        "data/source/employes.csv",
-                                        "data/source/facture_14_03_2025.pdf",
-                                        "data/source/fournisseurs.csv",
-                                        "data/source/historique_commandes.csv",
-                                        "data/source/planning_production_mars_2025.csv",
-                                        "data/source/stock_tshirt.csv"
-                                        ])
+
+    parser = create_parser()
+    args = parser.parse_args()
+    pipeline = create_pipeline()
+
+    source_path = getattr(args, "path", DEFAULT_SOURCE_PATH) or DEFAULT_SOURCE_PATH
+    documents_path = get_files_in_directory(source_path=source_path)
+
+    # eval_path = args.eval_file if args.eval_file else DEFAULT_EVAL_PATH
+    # sample_questions = json.load(open(eval_path, "r"))
+
+    commands = {
+        "run": lambda: pipeline.run(documents_path=documents_path),
+        "reset": lambda: pipeline.reset(),
+        "add": lambda: pipeline.add_documents(documents_path=documents_path),
+        "evaluate": lambda: pipeline.evaluate(),
+        "query": lambda: print(pipeline.process_query(args.prompt)),
+    }
+
+    try:
+        commands[args.commands]()
+    except Exception as e:
+        print(f"❌ ERROR: {e}")
+        import traceback
+        traceback.print_exc()
+
+    return
+
+def get_files_in_directory(source_path: str) -> List[str]:
+    if os.path.isfile(source_path):
+        return [source_path]
+    return glob.glob(os.path.join(source_path, "*"))
 
-    print("Testing datastore")
-    datastore = Datastore()
-    print(f"Model's maximum sequence length:{datastore.model.max_seq_length}")
-    test_vector = datastore.create_vector("test")
-
-    data_item_to_test = DataItem(
-        content = "Data item being tested",
-        source = "from a test"
-    )
-
-    datastore.add_items([data_item_to_test])
-
-    datastore.add_items(items_from_indexer)
-    print(datastore.search_datastore(("Data item being tested")))
-    print(datastore.search_datastore("Red t-shirt"))
-
-    print("Testing retriever")
-    retriever = Retriever(datastore= datastore)
-    #print(retriever.search_retriever(query_graphiste))
-
-    print("Testing Response generator")
-    response_generator = ResponseGenerator()
-    print(response_generator.generate_response(query_graphiste, retriever.search_retriever(query_graphiste)))
-    print("fin")
-    exit
-
-main()
\ No newline at end of file
+if __name__ == "__main__":
+    main()
\ No newline at end of file
diff --git a/src/RAG_pipeline.py b/src/RAG_pipeline.py
index 5bee608..50f6ed0 100644
--- a/src/RAG_pipeline.py
+++ b/src/RAG_pipeline.py
@@ -16,13 +16,31 @@ class RAGpipeline:
     evaluator: Optional[BaseEvaluator] = None
 
     def reset(self) -> None:
+        print("🗑️ Resetting the database...")
         self.datastore.reset_table()
-
-    def add_documents(self, documents: List[str]) -> None:
-        items = self.indexer.index(documents)
-        self.datastore.add_items(items= items)
         return
 
+    def add_documents(self, documents_path: List[str]) -> None:
+        items = self.indexer.index(documents_path)
+        # print(f"🔍 Adding documents: {', '.join(documents_path)}")
+        self.datastore.add_items(items=items)
+
+        return
+
+    def evaluate(self, arg1=None) -> None:
+        """To be completed when the evaluator is added.
+        """
+        print("Function not completed.")
+        return
+
+    def run(self, documents_path: List[str], arg2=None) -> None:
+        self.reset()
+        self.add_documents(documents_path=documents_path)
+        if arg2:
+            self.evaluate(arg2)
+        return
+
+
     def process_query(self, query: str, top_k: int = 5, source: bool = False) -> str:
         """Génère la réponse à la requête à partir du contexte récupéré.
         Affiche éventuellement les sources.
@@ -33,6 +51,7 @@
         Returns:
             str: Réponse générée
         """
+        print(f"[DEBUG] Processing query: {query}")
         context = self.retriever.search_retriever(query, top_k= top_k)
 
         if( source):
@@ -40,10 +59,6 @@
             print(f"Based on the document {i+1}: {doc} \n")
 
         response = self.response_generator.generate_response(query= query, context= context)
+        # print(f"Response: {response}")
+
         return response
-
-    def evaluate(self) -> None:
-        """A compléter lors de l'ajout de l'évaluateur
-        """
-        print("Function not completed.")
-        pass
\ No newline at end of file
diff --git a/src/impl/evaluator.py b/src/impl/evaluator.py
index f17bea4..a5271ee 100644
--- a/src/impl/evaluator.py
+++ b/src/impl/evaluator.py
@@ -3,7 +3,7 @@
 import requests
 
 
 class Evaluator(BaseEvaluator):
-    def __init__(self, model_name: str = "llama3.2:3b", base_url: str = "http://localhost:11434"):
+    def __init__(self, model_name: str = "llama3.2:8b", base_url: str = "http://localhost:11434"):
         self.base_url = base_url
         self.model_name = model_name
@@ -23,7 +23,7 @@ Then return the result in ... tags — either as 'true' or 'fal
 
     def evaluate(self, query: str, response: str, expected_answer: str) -> EvaluationResult:
         user_prompt = f"""
-        \n{query}
+        \n{query}
         \n{response}
         \n{expected_answer}
         """
diff --git a/src/impl/response_generator.py b/src/impl/response_generator.py
index 7b6ce97..65cc7cc 100644
--- a/src/impl/response_generator.py
+++ b/src/impl/response_generator.py
@@ -2,7 +2,7 @@ from typing import List, Optional
 from ..interface.base_response_generator import BaseResponseGenerator
 from groq import Groq
 import os
-
+from dotenv import load_dotenv
 
 SYSTEM_PROMPT = """Tu es un assistant intelligent qui répond aux questions en te basant sur le contexte fourni.
@@ -17,8 +17,10 @@ Règles importantes:
 
 class ResponseGenerator(BaseResponseGenerator):
     def __init__(self, api_key: Optional[str] = None):
-        try :
+        try:
+            load_dotenv()
             self.api_key = api_key or os.getenv("GROQ_API_KEY")
+            print("API key retrieved successfully\n")
         except Exception as e:
             raise ValueError(f"erreur avec la clé API: {e}")
diff --git a/src/interface/base_datastore.py b/src/interface/base_datastore.py
index be6bd86..32b808b 100644
--- a/src/interface/base_datastore.py
+++ b/src/interface/base_datastore.py
@@ -1,7 +1,7 @@
 from abc import ABC,abstractmethod
 from typing import List
 from pydantic import BaseModel
-
+from lancedb.table import Table
 
 class DataItem(BaseModel):
     content: str = ""
@@ -9,6 +9,10 @@ class DataItem(BaseModel):
 
 class BaseDatastore(ABC):
 
+    @abstractmethod
+    def reset_table(self) -> Table:
+        pass
+
     @abstractmethod
    def add_items(self, items: List[DataItem]) -> None:
         pass
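
A quick usage sketch of the new CLI, based only on the subcommands and flags defined in create_parser.py above; it assumes the snippet runs from the repository root so create_parser is importable, and the eval-file path is a placeholder since the evaluate step is not wired up yet in this change:

    from create_parser import create_parser

    parser = create_parser()

    # "run" inherits both parent arguments (-p/--path and -f/--eval_file)
    args = parser.parse_args(["run", "-p", "data/source/", "-f", "data/eval/questions.json"])
    print(args.commands, args.path, args.eval_file)  # -> run data/source/ data/eval/questions.json

    # "query" takes a single positional prompt
    args = parser.parse_args(["query", "What is the monthly gross salary of the graphic designer?"])
    print(args.commands, args.prompt)

Note that ResponseGenerator now calls load_dotenv(), so GROQ_API_KEY can live in a local .env file, which this change also adds to .gitignore.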