import os
import glob
import gc
from multiprocessing import Pool
from typing import List

import torch
from torch import bfloat16
import gradio as gr
from tqdm import tqdm

from chromadb.config import Settings
from langchain.chains import RetrievalQA
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
from langchain.vectorstores import Chroma
from langchain.llms import HuggingFacePipeline
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.docstore.document import Document
from langchain.document_loaders import (
    CSVLoader,
    EverNoteLoader,
    PDFMinerLoader,
    TextLoader,
    UnstructuredEPubLoader,
    UnstructuredHTMLLoader,
    UnstructuredMarkdownLoader,
    UnstructuredODTLoader,
    UnstructuredPowerPointLoader,
    UnstructuredWordDocumentLoader,
)

# Configuration
persist_directory = "db"                     # folder for the persisted vector store
source_directory = "source_documents"        # folder for uploaded source documents
embeddings_model_name = "all-MiniLM-L6-v2"
model = "tiiuae/falcon-7b-instruct"
chunk_size = 500
chunk_overlap = 50
target_source_chunks = 4

# The embedding model and LLM are loaded once at module import
embeddings = HuggingFaceEmbeddings(model_name=embeddings_model_name)
llm = HuggingFacePipeline.from_model_id(
    model_id=model,
    task="text-generation",
    device=0,
    model_kwargs={
        "temperature": 0.1,
        "trust_remote_code": True,
        "max_length": 100000,
        "top_p": 0.15,
        "top_k": 0,
        "repetition_penalty": 1.1,
        "num_return_sequences": 1,
        "torch_dtype": bfloat16,
    },
)

# Define the Chroma settings
CHROMA_SETTINGS = Settings(
    chroma_db_impl='duckdb+parquet',
    persist_directory=persist_directory,
    anonymized_telemetry=False,
)

# Map file extensions to document loaders and their arguments
LOADER_MAPPING = {
    ".csv": (CSVLoader, {}),
    ".doc": (UnstructuredWordDocumentLoader, {}),
    ".docx": (UnstructuredWordDocumentLoader, {}),
    ".enex": (EverNoteLoader, {}),
    ".epub": (UnstructuredEPubLoader, {}),
    ".html": (UnstructuredHTMLLoader, {}),
    ".md": (UnstructuredMarkdownLoader, {}),
    ".odt": (UnstructuredODTLoader, {}),
    ".pdf": (PDFMinerLoader, {}),
    ".ppt": (UnstructuredPowerPointLoader, {}),
    ".pptx": (UnstructuredPowerPointLoader, {}),
    ".txt": (TextLoader, {"encoding": "cp1252"}),
    # Add more mappings for other file extensions and loaders as needed
}
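
# A minimal sketch of registering an additional loader, assuming the loader
# class exists in your langchain version (UnstructuredRSTLoader here is an
# illustrative example, not part of the original mapping):
#
#   from langchain.document_loaders import UnstructuredRSTLoader
#   LOADER_MAPPING[".rst"] = (UnstructuredRSTLoader, {})
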
def load_single_document(file_path: str) -> List[Document]:
    """Load a single document using the loader mapped to its extension."""
    ext = "." + file_path.rsplit(".", 1)[-1]
    if ext in LOADER_MAPPING:
        loader_class, loader_args = LOADER_MAPPING[ext]
        loader = loader_class(file_path, **loader_args)
        return loader.load()
    raise ValueError(f"Unsupported file extension '{ext}'")


def load_documents(source_dir: str, ignored_files: List[str] = []) -> List[Document]:
    """Load all documents from the source documents directory, ignoring specified files."""
    all_files = []
    for ext in LOADER_MAPPING:
        all_files.extend(
            glob.glob(os.path.join(source_dir, f"**/*{ext}"), recursive=True)
        )
    filtered_files = [file_path for file_path in all_files if file_path not in ignored_files]

    # Load files in parallel, one worker per CPU core
    with Pool(processes=os.cpu_count()) as pool:
        results = []
        with tqdm(total=len(filtered_files), desc='Loading new documents', ncols=80) as pbar:
            for docs in pool.imap_unordered(load_single_document, filtered_files):
                results.extend(docs)
                pbar.update()
    return results


def process_documents(ignored_files: List[str] = []) -> List[Document]:
    """Load documents and split them into chunks."""
    print(f"Loading documents from {source_directory}")
    documents = load_documents(source_directory, ignored_files)
    if not documents:
        print("No new documents to load")
        exit(0)
    print(f"Loaded {len(documents)} new documents from {source_directory}")
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
    texts = text_splitter.split_documents(documents)
    print(f"Split into {len(texts)} chunks of text (max. {chunk_size} characters each)")
    return texts


def does_vectorstore_exist(persist_directory: str) -> bool:
    """Check whether a usable vectorstore already exists in persist_directory."""
    if os.path.exists(os.path.join(persist_directory, 'index')):
        if os.path.exists(os.path.join(persist_directory, 'chroma-collections.parquet')) and os.path.exists(os.path.join(persist_directory, 'chroma-embeddings.parquet')):
            list_index_files = glob.glob(os.path.join(persist_directory, 'index/*.bin'))
            list_index_files += glob.glob(os.path.join(persist_directory, 'index/*.pkl'))
            # At least 3 documents are needed in a working vectorstore
            if len(list_index_files) > 3:
                return True
    return False
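
# does_vectorstore_exist() is defined above, but ingest() below always rebuilds
# the store from scratch. A sketch of reusing an existing store instead,
# assuming the same Chroma/LangChain versions as the rest of this script
# (process_documents() skips files already recorded in collection metadata):
#
#   if does_vectorstore_exist(persist_directory):
#       db = Chroma(persist_directory=persist_directory,
#                   embedding_function=embeddings,
#                   client_settings=CHROMA_SETTINGS)
#       collection = db.get()
#       texts = process_documents([m["source"] for m in collection["metadatas"]])
#       db.add_documents(texts)
#   else:
#       db = Chroma.from_documents(process_documents(), embeddings,
#                                  persist_directory=persist_directory,
#                                  client_settings=CHROMA_SETTINGS)
#   db.persist()
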
def ingest():
    """Load documents from source_directory, split them, and persist a Chroma store."""
    os.makedirs(source_directory, exist_ok=True)

    # Load documents and split into chunks
    print(f"Loading documents from {source_directory}")
    documents = load_documents(source_directory)
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
    texts = text_splitter.split_documents(documents)
    print(f"Loaded {len(documents)} documents from {source_directory}")
    print(f"Split into {len(texts)} chunks of text (max. {chunk_size} characters each)")

    # Create and persist the local vectorstore
    db = Chroma.from_documents(texts, embeddings, persist_directory=persist_directory, client_settings=CHROMA_SETTINGS)
    db.persist()
    db = None


def embed_documents(files):
    """Save uploaded files to source_directory, ingest them, then clean up."""
    saved_files = []
    os.makedirs(source_directory, exist_ok=True)

    # Gradio passes each upload as raw bytes (type='binary'), so the original
    # filename is not available; write each upload under a generated name to
    # avoid overwriting. Like the original code, this assumes PDF content.
    for i, file_ in enumerate(files):
        file_path = os.path.join(source_directory, f"file_{i}.pdf")
        saved_files.append(file_path)
        with open(file_path, "wb") as f:
            f.write(file_)

    ingest()

    # Delete the saved files once they have been embedded
    for file_path in saved_files:
        os.remove(file_path)

    return {"message": "Files embedded successfully"}


def retrieve_documents(query: str):
    """Answer a query against the persisted vectorstore with a RetrievalQA chain."""
    mute_stream = False
    db = Chroma(persist_directory=persist_directory, embedding_function=embeddings, client_settings=CHROMA_SETTINGS)
    retriever = db.as_retriever(search_kwargs={"k": target_source_chunks})

    # Prepare the LLM callbacks
    callbacks = [] if mute_stream else [StreamingStdOutCallbackHandler()]
    qa = RetrievalQA.from_chain_type(llm=llm, chain_type="stuff", retriever=retriever, return_source_documents=False)

    # Get the answer from the chain
    res = qa(query)
    print(res)
    answer = res['result']

    # Release GPU memory between requests
    torch.cuda.empty_cache()
    gc.collect()
    return answer


with gr.Blocks() as demo:
    with gr.Row():
        with gr.Column():
            file_input = gr.File(file_count="multiple", file_types=["text", ".json", ".csv", ".pdf"], type='binary')
            initiate_btn = gr.Button(value="Generate Embedding")
        with gr.Column():
            question = gr.Textbox(label="Question")
            question_btn = gr.Button(value="Get Answer")
            answer = gr.Textbox(label="Answer")

    initiate_btn.click(embed_documents, inputs=file_input, api_name="embed-file")
    question_btn.click(retrieve_documents, inputs=question, outputs=answer, api_name="llm")

demo.queue().launch()
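
# Once the app is running, the named endpoints can also be called
# programmatically. A sketch using gradio_client, assuming the default local
# URL and that your gradio version exposes the api_name endpoints as below:
#
#   from gradio_client import Client
#   client = Client("http://127.0.0.1:7860/")
#   print(client.predict("What does the uploaded document say?", api_name="/llm"))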