RishuD7 committed on
Commit
7f80677
1 Parent(s): 84a1d33

first commit

Files changed (2)
  1. app.py +236 -0
  2. requirements.txt +17 -0
app.py ADDED
@@ -0,0 +1,236 @@
+ import os
+ import glob
+ import gc
+ from typing import List
+ from multiprocessing import Pool
+
+ import torch
+ from torch import bfloat16
+ import gradio as gr
+ from tqdm import tqdm
+ from chromadb.config import Settings
+
+ from langchain.chains import RetrievalQA
+ from langchain.embeddings import HuggingFaceEmbeddings
+ from langchain.llms import HuggingFacePipeline
+ from langchain.vectorstores import Chroma
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
+ from langchain.docstore.document import Document
+ from langchain.document_loaders import (
+     CSVLoader,
+     EverNoteLoader,
+     PDFMinerLoader,
+     TextLoader,
+     UnstructuredEmailLoader,
+     UnstructuredEPubLoader,
+     UnstructuredHTMLLoader,
+     UnstructuredMarkdownLoader,
+     UnstructuredODTLoader,
+     UnstructuredPowerPointLoader,
+     UnstructuredWordDocumentLoader,
+ )
+
+ # Configuration
+ persist_directory = "db"  # folder for the persisted Chroma database
+ source_directory = "source_documents"
+ embeddings_model_name = "all-MiniLM-L6-v2"
+ model = "tiiuae/falcon-7b-instruct"
+ chunk_size = 500
+ chunk_overlap = 50
+ target_source_chunks = 4
+
+ # Load the embedding model and the LLM once at import time
+ embeddings = HuggingFaceEmbeddings(model_name=embeddings_model_name)
+ llm = HuggingFacePipeline.from_model_id(
+     model_id=model,
+     task="text-generation",
+     device=0,
+     model_kwargs={
+         "temperature": 0.1,
+         "trust_remote_code": True,
+         "max_length": 100000,
+         "top_p": 0.15,
+         "top_k": 0,
+         "repetition_penalty": 1.1,
+         "num_return_sequences": 1,
+         "torch_dtype": bfloat16,
+     },
+ )
+
+ # Define the Chroma settings
+ CHROMA_SETTINGS = Settings(
+     chroma_db_impl='duckdb+parquet',
+     persist_directory=persist_directory,
+     anonymized_telemetry=False
+ )
+
+ # Map file extensions to document loaders and their arguments
+ LOADER_MAPPING = {
+     ".csv": (CSVLoader, {}),
+     ".doc": (UnstructuredWordDocumentLoader, {}),
+     ".docx": (UnstructuredWordDocumentLoader, {}),
+     ".enex": (EverNoteLoader, {}),
+     # ".eml": (MyElmLoader, {}),
+     ".epub": (UnstructuredEPubLoader, {}),
+     ".html": (UnstructuredHTMLLoader, {}),
+     ".md": (UnstructuredMarkdownLoader, {}),
+     ".odt": (UnstructuredODTLoader, {}),
+     ".pdf": (PDFMinerLoader, {}),
+     ".ppt": (UnstructuredPowerPointLoader, {}),
+     ".pptx": (UnstructuredPowerPointLoader, {}),
+     ".txt": (TextLoader, {"encoding": "cp1252"}),
+     # Add more mappings for other file extensions and loaders as needed
+ }
+
+
+ def load_single_document(file_path: str) -> List[Document]:
+     ext = "." + file_path.rsplit(".", 1)[-1]
+     if ext in LOADER_MAPPING:
+         loader_class, loader_args = LOADER_MAPPING[ext]
+         loader = loader_class(file_path, **loader_args)
+         return loader.load()
+
+     raise ValueError(f"Unsupported file extension '{ext}'")
+
+ def load_documents(source_dir: str, ignored_files: List[str] = []) -> List[Document]:
+     """
+     Loads all documents from the source documents directory, ignoring specified files
+     """
+     all_files = []
+     for ext in LOADER_MAPPING:
+         all_files.extend(
+             glob.glob(os.path.join(source_dir, f"**/*{ext}"), recursive=True)
+         )
+     filtered_files = [file_path for file_path in all_files if file_path not in ignored_files]
+
+     # Load files in parallel across CPU cores (Pool and tqdm are imported above)
+     with Pool(processes=os.cpu_count()) as pool:
+         results = []
+         with tqdm(total=len(filtered_files), desc='Loading new documents', ncols=80) as pbar:
+             for i, docs in enumerate(pool.imap_unordered(load_single_document, filtered_files)):
+                 results.extend(docs)
+                 pbar.update()
+
+     return results
+
+ def process_documents(ignored_files: List[str] = []) -> List[Document]:
+     """
+     Load documents and split in chunks
+     """
+     print(f"Loading documents from {source_directory}")
+     documents = load_documents(source_directory, ignored_files)
+     if not documents:
+         print("No new documents to load")
+         exit(0)
+     print(f"Loaded {len(documents)} new documents from {source_directory}")
+     text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
+     texts = text_splitter.split_documents(documents)
+     print(f"Split into {len(texts)} chunks of text (max. {chunk_size} characters each)")
+     return texts
+
+ def does_vectorstore_exist(persist_directory: str) -> bool:
+     """
+     Checks if vectorstore exists
+     """
+     if os.path.exists(os.path.join(persist_directory, 'index')):
+         if os.path.exists(os.path.join(persist_directory, 'chroma-collections.parquet')) and os.path.exists(os.path.join(persist_directory, 'chroma-embeddings.parquet')):
+             list_index_files = glob.glob(os.path.join(persist_directory, 'index/*.bin'))
+             list_index_files += glob.glob(os.path.join(persist_directory, 'index/*.pkl'))
+             # At least 3 documents are needed in a working vectorstore
+             if len(list_index_files) > 3:
+                 return True
+     return False
+
+ def ingest():
+     """Load everything in source_directory, split it, and persist it to the Chroma store."""
+     os.makedirs(source_directory, exist_ok=True)
+     print(f"Loading documents from {source_directory}")
+     documents = load_documents(source_directory)
+     print(f"Loaded {len(documents)} documents from {source_directory}")
+     text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
+     texts = text_splitter.split_documents(documents)
+     print(f"Split into {len(texts)} chunks of text (max. {chunk_size} characters each)")
+
+     # Create and persist the local vectorstore, reusing the module-level embeddings
+     db = Chroma.from_documents(texts, embeddings, persist_directory=persist_directory, client_settings=CHROMA_SETTINGS)
+     db.persist()
+     db = None
+
+ def embed_documents(files):
+     """Save the uploaded files into source_directory, ingest them, then clean up."""
+     saved_files = []
+     os.makedirs(source_directory, exist_ok=True)
+
+     # Gradio delivers each upload as raw bytes (type="binary"), so write every
+     # file under a unique name instead of overwriting a single "file.pdf"
+     for i, file_ in enumerate(files):
+         file_path = os.path.join(source_directory, f"file_{i}.pdf")
+         saved_files.append(file_path)
+         with open(file_path, "wb") as f:
+             f.write(file_)
+
+     # Embed once, after all files have been written
+     ingest()
+
+     # Delete the contents of the folder
+     for file_path in saved_files:
+         os.remove(file_path)
+
+     return {"message": "Files embedded successfully"}
+
+ def retrieve_documents(query: str):
+     """Answer a question against the persisted vectorstore."""
+     db = Chroma(persist_directory=persist_directory, embedding_function=embeddings, client_settings=CHROMA_SETTINGS)
+     retriever = db.as_retriever(search_kwargs={"k": target_source_chunks})
+
+     qa = RetrievalQA.from_chain_type(llm=llm, chain_type="stuff", retriever=retriever, return_source_documents=False)
+
+     # Get the answer from the chain
+     res = qa(query)
+     answer = res['result']
+
+     # Free GPU memory between requests
+     torch.cuda.empty_cache()
+     gc.collect()
+     return answer
+
+ with gr.Blocks() as demo:
+     with gr.Row():
+         with gr.Column():
+             file_input = gr.File(file_count="multiple", file_types=["text", ".json", ".csv", ".pdf"], type="binary")
+             initiate_btn = gr.Button(value="Generate Embedding")
+
+         with gr.Column():
+             question = gr.Textbox(label="Question")
+             question_btn = gr.Button(value="Ask")
+             answer = gr.Textbox(label="Answer")
+
+     initiate_btn.click(embed_documents, inputs=file_input, api_name="embed-file")
+
+     question_btn.click(retrieve_documents, inputs=question, outputs=answer, api_name="llm")
+
+ demo.queue().launch()
requirements.txt ADDED
@@ -0,0 +1,17 @@
+ transformers
+ einops
+ accelerate
+ xformers
+ torch
+ tqdm
+ chromadb==0.3.26
+ duckdb==0.8.0
+ pdfminer.six==20221105
+ unstructured==0.6.6
+ extract-msg==0.41.1
+ tabulate==0.9.0
+ pandoc==2.3
+ pypandoc==1.11
+ langchain==0.0.177
+ streamlit
+ sentence_transformers
+ gradio
+ PyPDF2==3.0.1